blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
  19 It is designed to match the working of the Python tokenizer exactly, except
  20 that it produces COMMENT tokens for comments and gives type OP for all
  21 operators
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 from typing import (
  31     Callable,
  32     Iterable,
  33     Iterator,
  34     List,
  35     Optional,
  36     Text,
  37     Tuple,
  38     Pattern,
  39     Union,
  40     cast,
  41 )
  42 from blib2to3.pgen2.token import *
  43 from blib2to3.pgen2.grammar import Grammar
  44
  45 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  46 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  47
  48 import regex as re
  49 from codecs import BOM_UTF8, lookup
  50 from blib2to3.pgen2.token import *
  51
  52 from . import token
  53
  54 __all__ = [x for x in dir(token) if x[0] != "_"] + [
  55     "tokenize",
  56     "generate_tokens",
  57     "untokenize",
  58 ]
  59 del token
  60
  61
  62 def group(*choices):
  63     return "(" + "|".join(choices) + ")"
  64
  65
  66 def any(*choices):
  67     return group(*choices) + "*"
  68
  69
  70 def maybe(*choices):
  71     return group(*choices) + "?"
  72
  73
  74 def _combinations(*l):
  75     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  76
  77
# Regex fragments (plain strings) that are combined further down into the
# patterns the tokenizer compiles.

# Horizontal whitespace only: space, form feed, tab.
Whitespace = r"[ \f\t]*"
# A comment runs from '#' up to, but not including, the line terminator.
Comment = r"#[^\r\n]*"
# Whitespace, any number of backslash-newline continuations, then an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# `\w+` also matches digit-only text, so it is not a strict identifier pattern.
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"\w+"
)

# Integer literals; digit groups may be separated by single underscores.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
# The optional trailing l/L below is presumably the Python 2 long suffix.
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o" is optional, so a bare leading zero followed by octal digits matches too.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either with a decimal point (exponent optional)
# or digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are tried in this order: imaginary, float, integer.
Number = group(Imagnumber, Floatnumber, Intnumber)
  98
# The four "tail end" patterns match the remainder of a string literal whose
# opening quote has already been consumed, up to and including the closing quote.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single letter, or a two-letter combination that
# includes r/R.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (optional prefix plus three quotes).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 114
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    # Single-character operators, each optionally followed by "=" (this is
    # also what matches ":=" and "==").
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Any one opening or closing bracket character.
Bracket = "[][(){}]"
# Line terminators and the remaining punctuation characters.
Special = group(r"\r?\n", r"[:;.,`@]")
# Every non-name, non-number, non-string token.
Funny = group(Operator, Bracket, Special)
 133
# First (or only) line of ' or " string.
# The match ends either at the closing quote or at a backslash-newline, in
# which case the string continues on the next line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one token; the token itself is capture group 1.
# Name is listed last, after Number (see the note on Name).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled patterns (`re` is the `regex` module imported at the top of this file).
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 145
# All accepted string prefixes: the r/f and r/b letters alone or paired in
# either order and any case, plus u/U alone or followed by r/R.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps the opening of a string literal to the compiled pattern that matches
# the rest of it. Bare prefixes map to None so that generate_tokens() can
# chain lookups with `or` until it reaches the entry for the quote character.
endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

# Every possible opening of a triple-quoted string, with and without prefix.
triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# Every possible opening of a single-quoted string, with and without prefix.
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

# Tab stop width used by generate_tokens() when measuring indentation.
tabsize = 8
 174
 175
 176 class TokenError(Exception):
 177     pass
 178
 179
 180 class StopTokenizing(Exception):
 181     pass
 182
 183
 184 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 185     (srow, scol) = xxx_todo_changeme
 186     (erow, ecol) = xxx_todo_changeme1
 187     print(
 188         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 189     )
 190
 191
# A (row, column) position in the source.
Coord = Tuple[int, int]
# Callback signature accepted by tokenize(): token type, token text, start
# position, end position, source line.
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
 194
 195
 196 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
 197     """
 198     The tokenize() function accepts two parameters: one representing the
 199     input stream, and one providing an output mechanism for tokenize().
 200
 201     The first parameter, readline, must be a callable object which provides
 202     the same interface as the readline() method of built-in file objects.
 203     Each call to the function should return one line of input as a string.
 204
 205     The second parameter, tokeneater, must also be a callable object. It is
 206     called once for each token, with five arguments, corresponding to the
 207     tuples generated by generate_tokens().
 208     """
 209     try:
 210         tokenize_loop(readline, tokeneater)
 211     except StopTokenizing:
 212         pass
 213
 214
 215 # backwards compatible interface
 216 def tokenize_loop(readline, tokeneater):
 217     for token_info in generate_tokens(readline):
 218         tokeneater(*token_info)
 219
 220
# Full token record as produced by generate_tokens():
# (type, text, start position, end position, source line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# Untokenizer also accepts abbreviated (type, text) pairs.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 223
 224
 225 class Untokenizer:
 226
 227     tokens: List[Text]
 228     prev_row: int
 229     prev_col: int
 230
 231     def __init__(self) -> None:
 232         self.tokens = []
 233         self.prev_row = 1
 234         self.prev_col = 0
 235
 236     def add_whitespace(self, start: Coord) -> None:
 237         row, col = start
 238         assert row <= self.prev_row
 239         col_offset = col - self.prev_col
 240         if col_offset:
 241             self.tokens.append(" " * col_offset)
 242
 243     def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
 244         for t in iterable:
 245             if len(t) == 2:
 246                 self.compat(cast(Tuple[int, str], t), iterable)
 247                 break
 248             tok_type, token, start, end, line = cast(
 249                 Tuple[int, Text, Coord, Coord, Text], t
 250             )
 251             self.add_whitespace(start)
 252             self.tokens.append(token)
 253             self.prev_row, self.prev_col = end
 254             if tok_type in (NEWLINE, NL):
 255                 self.prev_row += 1
 256                 self.prev_col = 0
 257         return "".join(self.tokens)
 258
 259     def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
 260         startline = False
 261         indents = []
 262         toks_append = self.tokens.append
 263         toknum, tokval = token
 264         if toknum in (NAME, NUMBER):
 265             tokval += " "
 266         if toknum in (NEWLINE, NL):
 267             startline = True
 268         for tok in iterable:
 269             toknum, tokval = tok[:2]
 270
 271             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 272                 tokval += " "
 273
 274             if toknum == INDENT:
 275                 indents.append(tokval)
 276                 continue
 277             elif toknum == DEDENT:
 278                 indents.pop()
 279                 continue
 280             elif toknum in (NEWLINE, NL):
 281                 startline = True
 282             elif startline and indents:
 283                 toks_append(indents[-1])
 284                 startline = False
 285             toks_append(tokval)
 286
 287
# Matches a comment line carrying an encoding declaration ("coding:" or
# "coding="); group 1 is the declared encoding name. Applied to decoded text.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a raw (bytes) line that is empty, whitespace-only, or a comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 290
 291
 292 def _get_normal_name(orig_enc: str) -> str:
 293     """Imitates get_normal_name in tokenizer.c."""
 294     # Only care about the first 12 characters.
 295     enc = orig_enc[:12].lower().replace("_", "-")
 296     if enc == "utf-8" or enc.startswith("utf-8-"):
 297         return "utf-8"
 298     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 299         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 300     ):
 301         return "iso-8859-1"
 302     return orig_enc
 303
 304
 305 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 306     """
 307     The detect_encoding() function is used to detect the encoding that should
 308     be used to decode a Python source file. It requires one argument, readline,
 309     in the same way as the tokenize() generator.
 310
 311     It will call readline a maximum of twice, and return the encoding used
 312     (as a string) and a list of any lines (left as bytes) it has read
 313     in.
 314
 315     It detects the encoding from the presence of a utf-8 bom or an encoding
 316     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 317     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 318     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 319     'utf-8-sig' is returned.
 320
 321     If no encoding is specified, then the default of 'utf-8' will be returned.
 322     """
 323     bom_found = False
 324     encoding = None
 325     default = "utf-8"
 326
 327     def read_or_stop() -> bytes:
 328         try:
 329             return readline()
 330         except StopIteration:
 331             return bytes()
 332
 333     def find_cookie(line: bytes) -> Optional[str]:
 334         try:
 335             line_string = line.decode("ascii")
 336         except UnicodeDecodeError:
 337             return None
 338         match = cookie_re.match(line_string)
 339         if not match:
 340             return None
 341         encoding = _get_normal_name(match.group(1))
 342         try:
 343             codec = lookup(encoding)
 344         except LookupError:
 345             # This behaviour mimics the Python interpreter
 346             raise SyntaxError("unknown encoding: " + encoding)
 347
 348         if bom_found:
 349             if codec.name != "utf-8":
 350                 # This behaviour mimics the Python interpreter
 351                 raise SyntaxError("encoding problem: utf-8")
 352             encoding += "-sig"
 353         return encoding
 354
 355     first = read_or_stop()
 356     if first.startswith(BOM_UTF8):
 357         bom_found = True
 358         first = first[3:]
 359         default = "utf-8-sig"
 360     if not first:
 361         return default, []
 362
 363     encoding = find_cookie(first)
 364     if encoding:
 365         return encoding, [first]
 366     if not blank_re.match(first):
 367         return default, [first]
 368
 369     second = read_or_stop()
 370     if not second:
 371         return default, [first]
 372
 373     encoding = find_cookie(second)
 374     if encoding:
 375         return encoding, [first, second]
 376
 377     return default, [first, second]
 378
 379
 380 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 381     """Transform tokens back into Python source code.
 382
 383     Each element returned by the iterable must be a token sequence
 384     with at least two elements, a token number and token value.  If
 385     only two tokens are passed, the resulting output is poor.
 386
 387     Round-trip invariant for full input:
 388         Untokenized source will match input source exactly
 389
 390     Round-trip invariant for limited input:
 391         # Output text will tokenize the back to the input
 392         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 393         newcode = untokenize(t1)
 394         readline = iter(newcode.splitlines(1)).next
 395         t2 = [tok[:2] for tokin generate_tokens(readline)]
 396         assert t1 == t2
 397     """
 398     ut = Untokenizer()
 399     return ut.untokenize(iterable)
 400
 401
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = iter(lines).__next__    # Example of alternate readline

    The optional grammar argument supplies `async_keywords`; when that is
    true, `async` and `await` are always emitted as ASYNC/AWAIT tokens.
    Without a grammar they are only treated that way in the contexts
    detected below (`async def` / `async for` and inside an async def).

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. For a string spanning
    several lines, all of its lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: number of the current line (first line is 1)
    # parenlev: nesting depth of open brackets
    # continued: set after a backslash continuation, cleared on the next line
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    # contstr: text collected so far of a string that spans lines
    # needcont: set when that string is single-quoted and must be continued
    #           with a trailing backslash on every line
    contstr, needcont = "", 0
    # contline: the source lines collected so far for the string in contstr
    contline: Optional[str] = None
    # Stack of indentation columns currently in effect.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed: an `async` NAME token held back until the following token shows
    #          whether it should be emitted as ASYNC instead
    stashed = None
    # async_def: currently inside an `async def`
    # async_def_indent: indentation level at which that `async def` started
    # async_def_nl: a NEWLINE has been seen since the `async def`
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and closing pattern of the string collected in contstr.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # An exhausted readline is handled exactly like an empty line (EOF).
            line = ""
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string is closed on this line; tokenizing continues
                # right after it in the token loop below.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string neither closed nor continued with a
                # backslash: report what was collected as an error token.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: collect the line and read on.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column = column + 1
                elif line[pos] == "\t":
                    # A tab advances to the next multiple of tabsize.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the measured column.
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                # Only whitespace and no line terminator: stop reading, the
                # same exit as for EOF.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # A comment-only line yields COMMENT followed by NL and does
                # not affect indentation.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or above) the level of the `async def`
                # ends the async context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # A new statement at or above the level of the `async def`, after
            # its NEWLINE has been seen, ends the async context as well.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token itself, without the leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break is NL, otherwise NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # token is only the opening of the string; look for its
                    # end on the rest of the line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        # Not closed here: the following lines are consumed
                        # by the "continued string" branch above.
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this picks
                        # the pattern of the first character that is a quote.
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        # Keyword mode or inside an async def: emit the
                        # dedicated token types directly.
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    # Hold back `async` until the next token is known.
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # `async def` / `async for`: re-emit the held-back
                            # NAME as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and punctuation; brackets adjust the nesting depth.
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at this position: emit the single character
                # as an error token and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    # End of input: flush a held-back token, close open indentation levels
    # and finish with ENDMARKER.
    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 673
 674
 675 if __name__ == "__main__":  # testing
 676     import sys
 677
 678     if len(sys.argv) > 1:
 679         tokenize(open(sys.argv[1]).readline)
 680     else:
 681         tokenize(sys.stdin.readline)