src/blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 import sys
  31 from typing import (
  32     Callable,
  33     Iterable,
  34     Iterator,
  35     List,
  36     Optional,
  37     Text,
  38     Tuple,
  39     Pattern,
  40     Union,
  41     cast,
  42 )
  43
  44 if sys.version_info >= (3, 8):
  45     from typing import Final
  46 else:
  47     from typing_extensions import Final
  48
  49 from blib2to3.pgen2.token import *
  50 from blib2to3.pgen2.grammar import Grammar
  51
# Module authorship metadata.
__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  54
  55 import re
  56 from codecs import BOM_UTF8, lookup
  57 from blib2to3.pgen2.token import *
  58
  59 from . import token
  60
# Public API: every name on the `token` module that does not start with an
# underscore, plus the entry points listed explicitly here.
__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
# The module object was only needed to build __all__; remove the name again.
del token
  67
  68
  69 def group(*choices):
  70     return "(" + "|".join(choices) + ")"
  71
  72
  73 def any(*choices):
  74     return group(*choices) + "*"
  75
  76
  77 def maybe(*choices):
  78     return group(*choices) + "?"
  79
  80
  81 def _combinations(*l):
  82     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  83
  84
# Regular-expression fragments (plain strings) for the lexical elements.
# They are only combined here; compilation happens further down the module.

# Horizontal whitespace only: space, form feed and tab (no newlines).
Whitespace = r"[ \f\t]*"
# A '#' and everything after it up to, but excluding, the line terminator.
Comment = r"#[^\r\n]*"
# Whitespace, any number of backslash-newline continuations (each followed by
# more whitespace), then an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# A run of characters that are neither whitespace nor any of the listed
# punctuation characters.
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

# Integer literals. Digit groups may be separated by single underscores; the
# hex, octal and decimal forms also accept an optional trailing l/L suffix.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The o/O marker is optional, so a bare leading-zero form such as 0777 matches.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float literals: either a decimal point is present (digits optional on one
# side) with an optional exponent, or digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are tried left to right: imaginary, then float, then integer.
Number = group(Imagnumber, Floatnumber, Intnumber)
 105
# The four "tail" patterns below match from the current position through the
# closing quote(s); a backslash always consumes the character after it.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one letter out of u/r/b/f (either case), or r
# followed by f or b, or f, b or u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string: optional prefix plus ''' or """.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 121
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Character class matching exactly one bracket character: ] [ ( ) { }.
Bracket = "[][(){}]"
# A line terminator (optionally preceded by \r) or one of : ; . , ` @
Special = group(r"\r?\n", r"[:;.,`@]")
# Union of the operator, bracket and special-character patterns.
Funny = group(Operator, Bracket, Special)
 140
# First (or only) line of ' or " string.
# Each alternative ends either with the closing quote or with a
# backslash-newline, the latter meaning the string continues on the next line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one outer group holding the token text;
# generate_tokens() reads that group through span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled once at import time and reused for every line that is scanned.
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 152
# Every accepted string prefix spelling: combinations of r with f, r with b
# (both orders and all case variants), plus u and u followed by r.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps a string opener to the compiled pattern that finds the end of that
# string. Single-quoted openers are keyed by the bare quote character only;
# triple-quoted openers are keyed with and without every prefix.
endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

# All spellings that open a triple-quoted string (with or without a prefix).
triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All spellings that open a single-quoted string (with or without a prefix).
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

# Tab stop width used by generate_tokens() when measuring indentation columns.
tabsize = 8
 180
 181
 182 class TokenError(Exception):
 183     pass
 184
 185
 186 class StopTokenizing(Exception):
 187     pass
 188
 189
 190 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 191     (srow, scol) = xxx_todo_changeme
 192     (erow, ecol) = xxx_todo_changeme1
 193     print(
 194         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 195     )
 196
 197
# A (row, column) position inside the source text.
Coord = Tuple[int, int]
# Callback signature used by tokenize(): (type, token string, start, end, line).
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
 200
 201
 202 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
 203     """
 204     The tokenize() function accepts two parameters: one representing the
 205     input stream, and one providing an output mechanism for tokenize().
 206
 207     The first parameter, readline, must be a callable object which provides
 208     the same interface as the readline() method of built-in file objects.
 209     Each call to the function should return one line of input as a string.
 210
 211     The second parameter, tokeneater, must also be a callable object. It is
 212     called once for each token, with five arguments, corresponding to the
 213     tuples generated by generate_tokens().
 214     """
 215     try:
 216         tokenize_loop(readline, tokeneater)
 217     except StopTokenizing:
 218         pass
 219
 220
 221 # backwards compatible interface
 222 def tokenize_loop(readline, tokeneater):
 223     for token_info in generate_tokens(readline):
 224         tokeneater(*token_info)
 225
 226
# Full token record: (type, token string, start coord, end coord, source line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# Either a bare (type, string) pair or a full token record.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 229
 230
 231 class Untokenizer:
 232
 233     tokens: List[Text]
 234     prev_row: int
 235     prev_col: int
 236
 237     def __init__(self) -> None:
 238         self.tokens = []
 239         self.prev_row = 1
 240         self.prev_col = 0
 241
 242     def add_whitespace(self, start: Coord) -> None:
 243         row, col = start
 244         assert row <= self.prev_row
 245         col_offset = col - self.prev_col
 246         if col_offset:
 247             self.tokens.append(" " * col_offset)
 248
 249     def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
 250         for t in iterable:
 251             if len(t) == 2:
 252                 self.compat(cast(Tuple[int, str], t), iterable)
 253                 break
 254             tok_type, token, start, end, line = cast(
 255                 Tuple[int, Text, Coord, Coord, Text], t
 256             )
 257             self.add_whitespace(start)
 258             self.tokens.append(token)
 259             self.prev_row, self.prev_col = end
 260             if tok_type in (NEWLINE, NL):
 261                 self.prev_row += 1
 262                 self.prev_col = 0
 263         return "".join(self.tokens)
 264
 265     def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
 266         startline = False
 267         indents = []
 268         toks_append = self.tokens.append
 269         toknum, tokval = token
 270         if toknum in (NAME, NUMBER):
 271             tokval += " "
 272         if toknum in (NEWLINE, NL):
 273             startline = True
 274         for tok in iterable:
 275             toknum, tokval = tok[:2]
 276
 277             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 278                 tokval += " "
 279
 280             if toknum == INDENT:
 281                 indents.append(tokval)
 282                 continue
 283             elif toknum == DEDENT:
 284                 indents.pop()
 285                 continue
 286             elif toknum in (NEWLINE, NL):
 287                 startline = True
 288             elif startline and indents:
 289                 toks_append(indents[-1])
 290                 startline = False
 291             toks_append(tokval)
 292
 293
# Matches a comment line carrying an encoding declaration ("coding:" or
# "coding=" followed by the name); group 1 is the declared encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Bytes pattern matching a line that is empty, whitespace-only, or whose first
# non-blank character starts a comment.
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 296
 297
 298 def _get_normal_name(orig_enc: str) -> str:
 299     """Imitates get_normal_name in tokenizer.c."""
 300     # Only care about the first 12 characters.
 301     enc = orig_enc[:12].lower().replace("_", "-")
 302     if enc == "utf-8" or enc.startswith("utf-8-"):
 303         return "utf-8"
 304     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 305         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 306     ):
 307         return "iso-8859-1"
 308     return orig_enc
 309
 310
 311 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 312     """
 313     The detect_encoding() function is used to detect the encoding that should
 314     be used to decode a Python source file. It requires one argument, readline,
 315     in the same way as the tokenize() generator.
 316
 317     It will call readline a maximum of twice, and return the encoding used
 318     (as a string) and a list of any lines (left as bytes) it has read
 319     in.
 320
 321     It detects the encoding from the presence of a utf-8 bom or an encoding
 322     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 323     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 324     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 325     'utf-8-sig' is returned.
 326
 327     If no encoding is specified, then the default of 'utf-8' will be returned.
 328     """
 329     bom_found = False
 330     encoding = None
 331     default = "utf-8"
 332
 333     def read_or_stop() -> bytes:
 334         try:
 335             return readline()
 336         except StopIteration:
 337             return bytes()
 338
 339     def find_cookie(line: bytes) -> Optional[str]:
 340         try:
 341             line_string = line.decode("ascii")
 342         except UnicodeDecodeError:
 343             return None
 344         match = cookie_re.match(line_string)
 345         if not match:
 346             return None
 347         encoding = _get_normal_name(match.group(1))
 348         try:
 349             codec = lookup(encoding)
 350         except LookupError:
 351             # This behaviour mimics the Python interpreter
 352             raise SyntaxError("unknown encoding: " + encoding)
 353
 354         if bom_found:
 355             if codec.name != "utf-8":
 356                 # This behaviour mimics the Python interpreter
 357                 raise SyntaxError("encoding problem: utf-8")
 358             encoding += "-sig"
 359         return encoding
 360
 361     first = read_or_stop()
 362     if first.startswith(BOM_UTF8):
 363         bom_found = True
 364         first = first[3:]
 365         default = "utf-8-sig"
 366     if not first:
 367         return default, []
 368
 369     encoding = find_cookie(first)
 370     if encoding:
 371         return encoding, [first]
 372     if not blank_re.match(first):
 373         return default, [first]
 374
 375     second = read_or_stop()
 376     if not second:
 377         return default, [first]
 378
 379     encoding = find_cookie(second)
 380     if encoding:
 381         return encoding, [first, second]
 382
 383     return default, [first, second]
 384
 385
 386 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 387     """Transform tokens back into Python source code.
 388
 389     Each element returned by the iterable must be a token sequence
 390     with at least two elements, a token number and token value.  If
 391     only two tokens are passed, the resulting output is poor.
 392
 393     Round-trip invariant for full input:
 394         Untokenized source will match input source exactly
 395
 396     Round-trip invariant for limited input:
 397         # Output text will tokenize the back to the input
 398         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 399         newcode = untokenize(t1)
 400         readline = iter(newcode.splitlines(1)).next
 401         t2 = [tok[:2] for tokin generate_tokens(readline)]
 402         assert t1 == t2
 403     """
 404     ut = Untokenizer()
 405     return ut.untokenize(iterable)
 406
 407
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The optional grammar argument, when given, supplies async_keywords: if
    that attribute is true, `async` and `await` are always emitted as ASYNC
    and AWAIT tokens instead of being recognized only in async contexts.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: number of the line being scanned; parenlev: bracket nesting depth;
    # continued: set after a backslash continuation was seen.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr accumulates the text of a string that spans several lines;
    # needcont is set when that string was opened with a single quote and
    # therefore needs a trailing backslash on every unfinished line.
    contstr, needcont = "", 0
    # The physical lines seen so far for the string held in contstr.
    contline: Optional[str] = None
    # Stack of indentation columns; the bottom entry is always 0.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds an `async` NAME token until the following token shows
    # whether it should be emitted as ASYNC or as an ordinary NAME.
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Both are assigned when a multi-line string is opened.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; scanning resumes after it.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string was neither closed nor continued
                # with a backslash: report everything collected as an error.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the column count.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # Only whitespace and no line terminator: treat as end of input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # leaves the async function body.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # A new statement at the `async def` level after its header line
            # ended also terminates the async context.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token text without the leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break is not significant (NL).
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # The quote character follows zero, one or two prefix
                        # letters; endprogs is keyed by the bare quote.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert maybe_endprog is not None, f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold `async` back until the next token is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the held `async` NAME as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Everything else is an operator; track bracket nesting.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: emit the single character as an error
                # token and keep scanning after it.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 681
 682
 683 if __name__ == "__main__":  # testing
 684     import sys
 685
 686     if len(sys.argv) > 1:
 687         tokenize(open(sys.argv[1]).readline)
 688     else:
 689         tokenize(sys.stdin.readline)