src/blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
  19 It is designed to match the working of the Python tokenizer exactly, except
  20 that it produces COMMENT tokens for comments and gives type OP for all
  21 operators
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 import sys
  31 from typing import (
  32     Callable,
  33     Iterable,
  34     Iterator,
  35     List,
  36     Optional,
  37     Set,
  38     Tuple,
  39     Pattern,
  40     Union,
  41     cast,
  42 )
  43
  44 from typing import Final
  45
  46 from blib2to3.pgen2.token import *
  47 from blib2to3.pgen2.grammar import Grammar
  48
  49 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  50 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  51
  52 import re
  53 from codecs import BOM_UTF8, lookup
  54 from blib2to3.pgen2.token import *
  55
  56 from . import token
  57
# Public API: every name in the token module that does not start with an
# underscore, plus this module's own entry points.
__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
# The module alias was only needed to build __all__; remove it from the
# namespace again.
del token
  64
  65
  66 def group(*choices: str) -> str:
  67     return "(" + "|".join(choices) + ")"
  68
  69
  70 def any(*choices: str) -> str:
  71     return group(*choices) + "*"
  72
  73
  74 def maybe(*choices: str) -> str:
  75     return group(*choices) + "?"
  76
  77
  78 def _combinations(*l: str) -> Set[str]:
  79     return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
  80
  81
# Regex fragments (plain strings) that are combined into PseudoToken below.

# Horizontal whitespace only: space, form feed, tab.
Whitespace = r"[ \f\t]*"
# A comment runs from '#' up to, but not including, the line ending.
Comment = r"#[^\r\n]*"
# Whitespace, any number of backslash-newline continuations, then an optional
# comment.  NOTE(review): not referenced anywhere else in the visible source.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# Name is deliberately permissive: any run of characters that are neither
# whitespace nor one of the listed punctuation characters.  That also matches
# text starting with a digit, which is acceptable only because Number is tried
# before Name wherever both appear in a group.
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

# Integer literals.  Digit groups may be separated by single underscores, and
# the hex, octal and decimal forms accept an optional trailing 'l'/'L'.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The 'o' is optional, so a bare leading zero followed by octal digits
# (e.g. 0777) matches as well.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: digits with a decimal point (optionally followed by
# an exponent), or digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: an integer or float followed by 'j'/'J'.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are ordered so that the longer forms (imaginary, then float)
# are tried before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
 102
# The four "tail end" patterns match the remainder of a string literal whose
# opening quote has already been consumed; they are compiled into the endprogs
# table further down.  A backslash always escapes the following character.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A lone quote is allowed inside as long as it is not
# the start of the closing triple quote.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one of u/r/b/f (either case), or a two-letter
# combination of r with f or b (either order), or u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quote characters).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 118
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Character class matching any single bracket character: ] [ ( ) { }.
Bracket = "[][(){}]"
# A line ending, or one of the remaining single-character punctuation tokens.
Special = group(r"\r?\n", r"[:;.,`@]")
# Everything that is not a name, number, string or comment.
Funny = group(Operator, Bracket, Special)
 137
# First (or only) line of ' or " string.  The literal either closes on this
# line or ends in a backslash-newline, in which case it continues on the next
# line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# One token preceded by optional whitespace.  The outer group() makes the
# token itself capture group 1, which generate_tokens() reads via span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled once at import time; generate_tokens() uses these for every line.
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 149
# Every accepted string prefix spelling: the r/f and r/b combinations produced
# by _combinations(), plus u and u followed by r in either case.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps a string opener to the compiled pattern that matches the rest of that
# string.  Triple-quoted openers are listed with and without every prefix;
# single-quoted strings are keyed by the bare quote character only.
endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

# All spellings that open a triple-quoted string (with or without a prefix).
triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All spellings that open a single-quoted string (with or without a prefix).
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
 175
# Tab stop width used by generate_tokens() when measuring indentation: a tab
# advances the column to the next multiple of this value.
tabsize = 8
 177
 178
 179 class TokenError(Exception):
 180     pass
 181
 182
 183 class StopTokenizing(Exception):
 184     pass
 185
 186
# A (row, column) position inside the source text.
Coord = Tuple[int, int]
 188
 189
 190 def printtoken(
 191     type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
 192 ) -> None:  # for testing
 193     (srow, scol) = srow_col
 194     (erow, ecol) = erow_col
 195     print(
 196         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 197     )
 198
 199
# Callback signature used by tokenize(): receives the five token fields
# (type, string, start position, end position, source line) as separate
# positional arguments.
TokenEater = Callable[[int, str, Coord, Coord, str], None]
 201
 202
 203 def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
 204     """
 205     The tokenize() function accepts two parameters: one representing the
 206     input stream, and one providing an output mechanism for tokenize().
 207
 208     The first parameter, readline, must be a callable object which provides
 209     the same interface as the readline() method of built-in file objects.
 210     Each call to the function should return one line of input as a string.
 211
 212     The second parameter, tokeneater, must also be a callable object. It is
 213     called once for each token, with five arguments, corresponding to the
 214     tuples generated by generate_tokens().
 215     """
 216     try:
 217         tokenize_loop(readline, tokeneater)
 218     except StopTokenizing:
 219         pass
 220
 221
 222 # backwards compatible interface
 223 def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
 224     for token_info in generate_tokens(readline):
 225         tokeneater(*token_info)
 226
 227
# Full token record as yielded by generate_tokens():
# (type, string, start position, end position, source line).
GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
# What untokenize() accepts: either a full record or a bare (type, string)
# pair without position information.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 230
 231
 232 class Untokenizer:
 233     tokens: List[str]
 234     prev_row: int
 235     prev_col: int
 236
 237     def __init__(self) -> None:
 238         self.tokens = []
 239         self.prev_row = 1
 240         self.prev_col = 0
 241
 242     def add_whitespace(self, start: Coord) -> None:
 243         row, col = start
 244         assert row <= self.prev_row
 245         col_offset = col - self.prev_col
 246         if col_offset:
 247             self.tokens.append(" " * col_offset)
 248
 249     def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
 250         for t in iterable:
 251             if len(t) == 2:
 252                 self.compat(cast(Tuple[int, str], t), iterable)
 253                 break
 254             tok_type, token, start, end, line = cast(
 255                 Tuple[int, str, Coord, Coord, str], t
 256             )
 257             self.add_whitespace(start)
 258             self.tokens.append(token)
 259             self.prev_row, self.prev_col = end
 260             if tok_type in (NEWLINE, NL):
 261                 self.prev_row += 1
 262                 self.prev_col = 0
 263         return "".join(self.tokens)
 264
 265     def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
 266         startline = False
 267         indents = []
 268         toks_append = self.tokens.append
 269         toknum, tokval = token
 270         if toknum in (NAME, NUMBER):
 271             tokval += " "
 272         if toknum in (NEWLINE, NL):
 273             startline = True
 274         for tok in iterable:
 275             toknum, tokval = tok[:2]
 276
 277             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 278                 tokval += " "
 279
 280             if toknum == INDENT:
 281                 indents.append(tokval)
 282                 continue
 283             elif toknum == DEDENT:
 284                 indents.pop()
 285                 continue
 286             elif toknum in (NEWLINE, NL):
 287                 startline = True
 288             elif startline and indents:
 289                 toks_append(indents[-1])
 290                 startline = False
 291             toks_append(tokval)
 292
 293
# Matches a comment line containing "coding:" or "coding=" and captures the
# encoding name that follows.  Applied to text (str).
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a line that is empty, whitespace only, or whose first
# non-whitespace character starts a comment.  Applied to raw bytes.
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 296
 297
 298 def _get_normal_name(orig_enc: str) -> str:
 299     """Imitates get_normal_name in tokenizer.c."""
 300     # Only care about the first 12 characters.
 301     enc = orig_enc[:12].lower().replace("_", "-")
 302     if enc == "utf-8" or enc.startswith("utf-8-"):
 303         return "utf-8"
 304     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 305         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 306     ):
 307         return "iso-8859-1"
 308     return orig_enc
 309
 310
 311 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 312     """
 313     The detect_encoding() function is used to detect the encoding that should
 314     be used to decode a Python source file. It requires one argument, readline,
 315     in the same way as the tokenize() generator.
 316
 317     It will call readline a maximum of twice, and return the encoding used
 318     (as a string) and a list of any lines (left as bytes) it has read
 319     in.
 320
 321     It detects the encoding from the presence of a utf-8 bom or an encoding
 322     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 323     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 324     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 325     'utf-8-sig' is returned.
 326
 327     If no encoding is specified, then the default of 'utf-8' will be returned.
 328     """
 329     bom_found = False
 330     encoding = None
 331     default = "utf-8"
 332
 333     def read_or_stop() -> bytes:
 334         try:
 335             return readline()
 336         except StopIteration:
 337             return b''
 338
 339     def find_cookie(line: bytes) -> Optional[str]:
 340         try:
 341             line_string = line.decode("ascii")
 342         except UnicodeDecodeError:
 343             return None
 344         match = cookie_re.match(line_string)
 345         if not match:
 346             return None
 347         encoding = _get_normal_name(match.group(1))
 348         try:
 349             codec = lookup(encoding)
 350         except LookupError:
 351             # This behaviour mimics the Python interpreter
 352             raise SyntaxError("unknown encoding: " + encoding)
 353
 354         if bom_found:
 355             if codec.name != "utf-8":
 356                 # This behaviour mimics the Python interpreter
 357                 raise SyntaxError("encoding problem: utf-8")
 358             encoding += "-sig"
 359         return encoding
 360
 361     first = read_or_stop()
 362     if first.startswith(BOM_UTF8):
 363         bom_found = True
 364         first = first[3:]
 365         default = "utf-8-sig"
 366     if not first:
 367         return default, []
 368
 369     encoding = find_cookie(first)
 370     if encoding:
 371         return encoding, [first]
 372     if not blank_re.match(first):
 373         return default, [first]
 374
 375     second = read_or_stop()
 376     if not second:
 377         return default, [first]
 378
 379     encoding = find_cookie(second)
 380     if encoding:
 381         return encoding, [first, second]
 382
 383     return default, [first, second]
 384
 385
 386 def untokenize(iterable: Iterable[TokenInfo]) -> str:
 387     """Transform tokens back into Python source code.
 388
 389     Each element returned by the iterable must be a token sequence
 390     with at least two elements, a token number and token value.  If
 391     only two tokens are passed, the resulting output is poor.
 392
 393     Round-trip invariant for full input:
 394         Untokenized source will match input source exactly
 395
 396     Round-trip invariant for limited input:
 397         # Output text will tokenize the back to the input
 398         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 399         newcode = untokenize(t1)
 400         readline = iter(newcode.splitlines(1)).next
 401         t2 = [tok[:2] for tokin generate_tokens(readline)]
 402         assert t1 == t2
 403     """
 404     ut = Untokenizer()
 405     return ut.untokenize(iterable)
 406
 407
def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = iter(lines).__next__    # Example of alternate readline

    The optional grammar argument only supplies its async_keywords flag:
    when that flag is true, `async` and `await` are always emitted as
    ASYNC/AWAIT tokens. Without a grammar they are recognised only in the
    context of an `async def` or `async for`.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: 1-based number of the current line; parenlev: bracket nesting
    # depth; continued: set after a backslash continuation.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr accumulates the text of a string that spans lines; needcont is
    # set for single-quoted strings, which need a trailing backslash on every
    # continued line.
    contstr, needcont = "", 0
    contline: Optional[str] = None
    # Stack of indentation columns currently open; the bottom entry is 0.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds a NAME token for `async` until the following token shows
    # whether it introduces `async def` / `async for`.
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Both are assigned when a multi-line string starts and read when a later
    # line continues it.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        # NOTE: `max` shadows the builtin within this function.
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; tokenizing resumes right
                # after it in the loop further down.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string continued onto a line that neither
                # closes it nor ends in a backslash: report it as an error.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: accumulate and read the next line.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the indentation count.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # The line consists of whitespace only and has no line
                # ending: treat it as end of input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # Comment-only lines yield COMMENT then NL and do not affect
                # the indentation stack.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # A new statement at the level of the `async def`, after its
            # header line has ended, also ends the async context.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line ending is not significant (NL);
                    # otherwise it terminates the statement (NEWLINE).
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # The quote character follows zero, one or two prefix
                        # letters; endprogs is keyed by the bare quote.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    # Hold back a plain `async` until the next token is seen.
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed `async` as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and brackets; track nesting so that line
                    # endings inside brackets are reported as NL.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at this position: emit the single character
                # as an error token and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 682
 683
 684 if __name__ == "__main__":  # testing
 685     import sys
 686
 687     if len(sys.argv) > 1:
 688         tokenize(open(sys.argv[1]).readline)
 689     else:
 690         tokenize(sys.stdin.readline)