blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
  19 It is designed to match the working of the Python tokenizer exactly, except
  20 that it produces COMMENT tokens for comments and gives type OP for all
  21 operators
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 from typing import (
  31     Callable,
  32     Iterable,
  33     Iterator,
  34     List,
  35     Optional,
  36     Text,
  37     Tuple,
  38     Pattern,
  39     Union,
  40     cast,
  41 )
  42 from blib2to3.pgen2.token import *
  43 from blib2to3.pgen2.grammar import Grammar
  44
  45 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  46 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  47
  48 import regex as re
  49 from codecs import BOM_UTF8, lookup
  50 from blib2to3.pgen2.token import *
  51
  52 from . import token
  53
  54 __all__ = [x for x in dir(token) if x[0] != "_"] + [
  55     "tokenize",
  56     "generate_tokens",
  57     "untokenize",
  58 ]
  59 del token
  60
  61
  62 def group(*choices):
  63     return "(" + "|".join(choices) + ")"
  64
  65
  66 def any(*choices):
  67     return group(*choices) + "*"
  68
  69
  70 def maybe(*choices):
  71     return group(*choices) + "?"
  72
  73
  74 def _combinations(*l):
  75     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  76
  77
# Regular-expression source fragments.  They are combined below into
# PseudoToken, the single pattern used to pull the next token off a line.

# Horizontal whitespace between tokens (space, form feed, tab).
Whitespace = r"[ \f\t]*"
# A comment runs from "#" up to, but not including, the line terminator.
Comment = r"#[^\r\n]*"
# Whitespace, any number of backslash-newline continuations, optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = r"\w+"  # this is invalid but it's fine because Name comes after Number in all groups

# Integer literals.  "_" digit separators are accepted; the hex, octal and
# decimal forms also accept a trailing "l"/"L" suffix, and the octal form
# accepts a bare leading "0" with no "o"/"O".
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating-point and imaginary literals.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are tried in order, so the more specific forms come first.
Number = group(Imagnumber, Floatnumber, Intnumber)

# The four "tail end" patterns match the remainder of a string literal, up to
# and including its closing quote(s), once the opening quote has been consumed.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one of u/r/b/f (either case), or a two-letter
# combination of r with f or b, or of f/b/u with a following r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quote characters).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Alternatives in a regex group are tried left to right and the first one
# that matches wins, so be sure to put the longest operators first (e.g., if
# = came before ==, == would get recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Any single bracket character: ] [ ( ) { }.
Bracket = "[][(){}]"
# Line terminators and the remaining one-character punctuation.
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.  The literal either closes on this
# line or ends in a backslash-newline, meaning it continues on the next line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one token; capture group 1 is the token
# itself, without the whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 139
 140 pseudoprog = re.compile(PseudoToken, re.UNICODE)
 141 single3prog = re.compile(Single3)
 142 double3prog = re.compile(Double3)
 143
 144 _strprefixes = (
 145     _combinations("r", "R", "f", "F")
 146     | _combinations("r", "R", "b", "B")
 147     | {"u", "U", "ur", "uR", "Ur", "UR"}
 148 )
 149
 150 endprogs = {
 151     "'": re.compile(Single),
 152     '"': re.compile(Double),
 153     "'''": single3prog,
 154     '"""': double3prog,
 155     **{f"{prefix}'''": single3prog for prefix in _strprefixes},
 156     **{f'{prefix}"""': double3prog for prefix in _strprefixes},
 157     **{prefix: None for prefix in _strprefixes},
 158 }
 159
 160 triple_quoted = (
 161     {"'''", '"""'}
 162     | {f"{prefix}'''" for prefix in _strprefixes}
 163     | {f'{prefix}"""' for prefix in _strprefixes}
 164 )
 165 single_quoted = (
 166     {"'", '"'}
 167     | {f"{prefix}'" for prefix in _strprefixes}
 168     | {f'{prefix}"' for prefix in _strprefixes}
 169 )
 170
 171 tabsize = 8
 172
 173
 174 class TokenError(Exception):
 175     pass
 176
 177
 178 class StopTokenizing(Exception):
 179     pass
 180
 181
 182 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 183     (srow, scol) = xxx_todo_changeme
 184     (erow, ecol) = xxx_todo_changeme1
 185     print(
 186         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 187     )
 188
 189
# (row, column) position of a token boundary in the source.
Coord = Tuple[int, int]
# Signature of the callback accepted by tokenize(): it receives the five token
# fields (type, string, start, end, line) as separate positional arguments.
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
 192
 193
 194 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
 195     """
 196     The tokenize() function accepts two parameters: one representing the
 197     input stream, and one providing an output mechanism for tokenize().
 198
 199     The first parameter, readline, must be a callable object which provides
 200     the same interface as the readline() method of built-in file objects.
 201     Each call to the function should return one line of input as a string.
 202
 203     The second parameter, tokeneater, must also be a callable object. It is
 204     called once for each token, with five arguments, corresponding to the
 205     tuples generated by generate_tokens().
 206     """
 207     try:
 208         tokenize_loop(readline, tokeneater)
 209     except StopTokenizing:
 210         pass
 211
 212
 213 # backwards compatible interface
 214 def tokenize_loop(readline, tokeneater):
 215     for token_info in generate_tokens(readline):
 216         tokeneater(*token_info)
 217
 218
# Full five-field token as yielded by generate_tokens():
# (type, string, start, end, line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# What untokenize() accepts: a bare (type, string) pair or a full token.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 221
 222
 223 class Untokenizer:
 224
 225     tokens: List[Text]
 226     prev_row: int
 227     prev_col: int
 228
 229     def __init__(self) -> None:
 230         self.tokens = []
 231         self.prev_row = 1
 232         self.prev_col = 0
 233
 234     def add_whitespace(self, start: Coord) -> None:
 235         row, col = start
 236         assert row <= self.prev_row
 237         col_offset = col - self.prev_col
 238         if col_offset:
 239             self.tokens.append(" " * col_offset)
 240
 241     def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
 242         for t in iterable:
 243             if len(t) == 2:
 244                 self.compat(cast(Tuple[int, str], t), iterable)
 245                 break
 246             tok_type, token, start, end, line = cast(
 247                 Tuple[int, Text, Coord, Coord, Text], t
 248             )
 249             self.add_whitespace(start)
 250             self.tokens.append(token)
 251             self.prev_row, self.prev_col = end
 252             if tok_type in (NEWLINE, NL):
 253                 self.prev_row += 1
 254                 self.prev_col = 0
 255         return "".join(self.tokens)
 256
 257     def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
 258         startline = False
 259         indents = []
 260         toks_append = self.tokens.append
 261         toknum, tokval = token
 262         if toknum in (NAME, NUMBER):
 263             tokval += " "
 264         if toknum in (NEWLINE, NL):
 265             startline = True
 266         for tok in iterable:
 267             toknum, tokval = tok[:2]
 268
 269             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 270                 tokval += " "
 271
 272             if toknum == INDENT:
 273                 indents.append(tokval)
 274                 continue
 275             elif toknum == DEDENT:
 276                 indents.pop()
 277                 continue
 278             elif toknum in (NEWLINE, NL):
 279                 startline = True
 280             elif startline and indents:
 281                 toks_append(indents[-1])
 282                 startline = False
 283             toks_append(tokval)
 284
 285
# Matches a comment line containing an encoding declaration ("coding:" or
# "coding=") and captures the declared encoding name in group 1.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a bytes line that is empty or holds only whitespace, optionally
# followed by a comment or a line terminator.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 288
 289
 290 def _get_normal_name(orig_enc: str) -> str:
 291     """Imitates get_normal_name in tokenizer.c."""
 292     # Only care about the first 12 characters.
 293     enc = orig_enc[:12].lower().replace("_", "-")
 294     if enc == "utf-8" or enc.startswith("utf-8-"):
 295         return "utf-8"
 296     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 297         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 298     ):
 299         return "iso-8859-1"
 300     return orig_enc
 301
 302
 303 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 304     """
 305     The detect_encoding() function is used to detect the encoding that should
 306     be used to decode a Python source file. It requires one argument, readline,
 307     in the same way as the tokenize() generator.
 308
 309     It will call readline a maximum of twice, and return the encoding used
 310     (as a string) and a list of any lines (left as bytes) it has read
 311     in.
 312
 313     It detects the encoding from the presence of a utf-8 bom or an encoding
 314     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 315     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 316     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 317     'utf-8-sig' is returned.
 318
 319     If no encoding is specified, then the default of 'utf-8' will be returned.
 320     """
 321     bom_found = False
 322     encoding = None
 323     default = "utf-8"
 324
 325     def read_or_stop() -> bytes:
 326         try:
 327             return readline()
 328         except StopIteration:
 329             return bytes()
 330
 331     def find_cookie(line: bytes) -> Optional[str]:
 332         try:
 333             line_string = line.decode("ascii")
 334         except UnicodeDecodeError:
 335             return None
 336         match = cookie_re.match(line_string)
 337         if not match:
 338             return None
 339         encoding = _get_normal_name(match.group(1))
 340         try:
 341             codec = lookup(encoding)
 342         except LookupError:
 343             # This behaviour mimics the Python interpreter
 344             raise SyntaxError("unknown encoding: " + encoding)
 345
 346         if bom_found:
 347             if codec.name != "utf-8":
 348                 # This behaviour mimics the Python interpreter
 349                 raise SyntaxError("encoding problem: utf-8")
 350             encoding += "-sig"
 351         return encoding
 352
 353     first = read_or_stop()
 354     if first.startswith(BOM_UTF8):
 355         bom_found = True
 356         first = first[3:]
 357         default = "utf-8-sig"
 358     if not first:
 359         return default, []
 360
 361     encoding = find_cookie(first)
 362     if encoding:
 363         return encoding, [first]
 364     if not blank_re.match(first):
 365         return default, [first]
 366
 367     second = read_or_stop()
 368     if not second:
 369         return default, [first]
 370
 371     encoding = find_cookie(second)
 372     if encoding:
 373         return encoding, [first, second]
 374
 375     return default, [first, second]
 376
 377
 378 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 379     """Transform tokens back into Python source code.
 380
 381     Each element returned by the iterable must be a token sequence
 382     with at least two elements, a token number and token value.  If
 383     only two tokens are passed, the resulting output is poor.
 384
 385     Round-trip invariant for full input:
 386         Untokenized source will match input source exactly
 387
 388     Round-trip invariant for limited input:
 389         # Output text will tokenize the back to the input
 390         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 391         newcode = untokenize(t1)
 392         readline = iter(newcode.splitlines(1)).next
 393         t2 = [tok[:2] for tokin generate_tokens(readline)]
 394         assert t1 == t2
 395     """
 396     ut = Untokenizer()
 397     return ut.untokenize(iterable)
 398
 399
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    Tokenize the text supplied by readline, yielding one 5-tuple per token.

    readline must be a callable object which provides the same interface as
    the readline() method of built-in file objects. Each call to the function
    should return one line of input as a string, with "" meaning end of
    input.  Alternately, readline can be a callable that signals the end of
    input by raising StopIteration:
        readline = iter(lines).__next__    # Example of alternate readline

    grammar is optional; only its async_keywords attribute is read.  When
    that is true, `async` and `await` are always produced as ASYNC and AWAIT
    tokens.  Otherwise they are produced as such only inside an `async def`
    body, and `async` directly before `def` or `for` becomes ASYNC.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line as returned by readline; for a string literal spanning
    several lines it is all of those lines concatenated.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a column
    that matches no enclosing indentation level.
    """
    # lnum: 1-based number of the line most recently read.
    # parenlev: nesting depth of currently open brackets.
    # continued: set to 1 after a backslash continuation, cleared on the next line.
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    # contstr: text collected so far of a string literal that spans lines.
    # needcont: 1 when that literal is a single-quoted string, whose lines must
    # each end in a backslash for the string to continue.
    contstr, needcont = "", 0
    # Source lines collected so far for the multi-line string.
    contline: Optional[str] = None
    # Stack of indentation columns of the enclosing blocks.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed: a NAME token for `async` held back until the following token
    # shows whether it must be emitted as ASYNC instead.
    stashed = None
    # async_def: True while tokenizing the body of an `async def`.
    # async_def_indent: indentation column at which that `async def` started.
    # async_def_nl: True once a NEWLINE has been seen since the `async def`.
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and end-matching pattern of the string literal in progress.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; tokenizing resumes after it.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string neither closed nor continued with a
                # backslash: report everything collected as an error token.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: keep collecting.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column = column + 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the column count.
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                # Only whitespace and no line terminator: treated as the end
                # of the input.
                break

            if stashed:
                yield stashed
                stashed = None

            # Blank and comment-only lines yield NL (not NEWLINE) and do not
            # affect the indentation stack.
            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to the level of the `async def` ends it.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # A new statement at or left of the `async def` level, after that
            # definition's line has ended, also ends the async context.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break does not end the statement.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        # Remember where the string began and collect the
                        # rest from the following lines.
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the pattern for the first quote character.
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    # Hold `async` back until the next token is known.
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the held-back `async` as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and punctuation; brackets adjust the nesting depth.
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: emit the character as an error token
                # and move past it.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 671
 672
 673 if __name__ == "__main__":  # testing
 674     import sys
 675
 676     if len(sys.argv) > 1:
 677         tokenize(open(sys.argv[1]).readline)
 678     else:
 679         tokenize(sys.stdin.readline)