src/blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 import sys
  31 from typing import (
  32     Callable,
  33     Iterable,
  34     Iterator,
  35     List,
  36     Optional,
  37     Text,
  38     Tuple,
  39     Pattern,
  40     Union,
  41     cast,
  42 )
  43
  44 if sys.version_info >= (3, 8):
  45     from typing import Final
  46 else:
  47     from typing_extensions import Final
  48
  49 from blib2to3.pgen2.token import *
  50 from blib2to3.pgen2.grammar import Grammar
  51
  52 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  53 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  54
  55 import regex as re
  56 from codecs import BOM_UTF8, lookup
  57 from blib2to3.pgen2.token import *
  58
  59 from . import token
  60
# Public API: every name of the token module that does not start with an
# underscore, plus this module's own entry points.
__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
# The module object was only needed to build __all__; remove the global so the
# name `token` is free for use as an ordinary variable below.
del token
  67
  68
  69 def group(*choices):
  70     return "(" + "|".join(choices) + ")"
  71
  72
  73 def any(*choices):
  74     return group(*choices) + "*"
  75
  76
  77 def maybe(*choices):
  78     return group(*choices) + "?"
  79
  80
  81 def _combinations(*l):
  82     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  83
  84
# Optional horizontal whitespace: any run of spaces, form feeds and tabs.
Whitespace = r"[ \f\t]*"
# A comment: "#" up to, but not including, the line terminator.
Comment = r"#[^\r\n]*"
# Skippable text: whitespace, any number of backslash-newline continuations
# (each followed by whitespace), then an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"\w+"
)
  91
# Integer literals.  Digit groups may be joined by single underscores; the
# hex, octal and decimal forms also accept an optional trailing l/L.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The o/O marker is optional here, so a bare leading zero also matches.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either a decimal point (with an optional exponent)
# or plain digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are tried in this order: imaginary, float, integer.
Number = group(Imagnumber, Floatnumber, Intnumber)
 105
# The four "tail" patterns below match the remainder of a string literal up to
# and including its closing quote(s); a backslash escapes the next character.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single u/r/b/f letter, or a two-letter combination
# that includes r (either case for every letter).
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string: optional prefix plus three quotes.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 121
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Any single opening or closing bracket character.
Bracket = "[][(){}]"
# A line terminator or one of the remaining punctuation characters.
Special = group(r"\r?\n", r"[:;.,`@]")
# Everything that is neither a name, a number, a string nor a comment.
Funny = group(Operator, Bracket, Special)
 140
# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# A backslash line continuation, a comment, or the opening of a triple-quoted
# string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one token candidate.  The candidate is the
# first capturing group, which is what generate_tokens() reads via span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled once at import time.
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 152
# Every accepted spelling of a string prefix (without the quote characters).
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps the opening of a string literal to the compiled pattern that matches
# the rest of it.  Bare prefixes map to None so that the
# `endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]` lookup in
# generate_tokens() falls through to the entry for the quote character.
endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

# All possible openings of a triple-quoted string, with and without prefix.
triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All possible openings of a single-quoted string, with and without prefix.
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
 179
# Tab stop width used by generate_tokens() when it converts leading tabs into
# an indentation column.
tabsize = 8
 181
 182
 183 class TokenError(Exception):
 184     pass
 185
 186
 187 class StopTokenizing(Exception):
 188     pass
 189
 190
 191 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 192     (srow, scol) = xxx_todo_changeme
 193     (erow, ecol) = xxx_todo_changeme1
 194     print(
 195         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 196     )
 197
 198
# A (row, column) position in the source.
Coord = Tuple[int, int]
# Callback taking the five token fields: type, string, start, end, line.
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
 201
 202
 203 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
 204     """
 205     The tokenize() function accepts two parameters: one representing the
 206     input stream, and one providing an output mechanism for tokenize().
 207
 208     The first parameter, readline, must be a callable object which provides
 209     the same interface as the readline() method of built-in file objects.
 210     Each call to the function should return one line of input as a string.
 211
 212     The second parameter, tokeneater, must also be a callable object. It is
 213     called once for each token, with five arguments, corresponding to the
 214     tuples generated by generate_tokens().
 215     """
 216     try:
 217         tokenize_loop(readline, tokeneater)
 218     except StopTokenizing:
 219         pass
 220
 221
 222 # backwards compatible interface
 223 def tokenize_loop(readline, tokeneater):
 224     for token_info in generate_tokens(readline):
 225         tokeneater(*token_info)
 226
 227
# A complete token: (type, string, start, end, line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# What untokenize() accepts: a bare (type, string) pair or a complete token.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 230
 231
 232 class Untokenizer:
 233
 234     tokens: List[Text]
 235     prev_row: int
 236     prev_col: int
 237
 238     def __init__(self) -> None:
 239         self.tokens = []
 240         self.prev_row = 1
 241         self.prev_col = 0
 242
 243     def add_whitespace(self, start: Coord) -> None:
 244         row, col = start
 245         assert row <= self.prev_row
 246         col_offset = col - self.prev_col
 247         if col_offset:
 248             self.tokens.append(" " * col_offset)
 249
 250     def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
 251         for t in iterable:
 252             if len(t) == 2:
 253                 self.compat(cast(Tuple[int, str], t), iterable)
 254                 break
 255             tok_type, token, start, end, line = cast(
 256                 Tuple[int, Text, Coord, Coord, Text], t
 257             )
 258             self.add_whitespace(start)
 259             self.tokens.append(token)
 260             self.prev_row, self.prev_col = end
 261             if tok_type in (NEWLINE, NL):
 262                 self.prev_row += 1
 263                 self.prev_col = 0
 264         return "".join(self.tokens)
 265
 266     def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
 267         startline = False
 268         indents = []
 269         toks_append = self.tokens.append
 270         toknum, tokval = token
 271         if toknum in (NAME, NUMBER):
 272             tokval += " "
 273         if toknum in (NEWLINE, NL):
 274             startline = True
 275         for tok in iterable:
 276             toknum, tokval = tok[:2]
 277
 278             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 279                 tokval += " "
 280
 281             if toknum == INDENT:
 282                 indents.append(tokval)
 283                 continue
 284             elif toknum == DEDENT:
 285                 indents.pop()
 286                 continue
 287             elif toknum in (NEWLINE, NL):
 288                 startline = True
 289             elif startline and indents:
 290                 toks_append(indents[-1])
 291                 startline = False
 292             toks_append(tokval)
 293
 294
# Matches a comment line carrying an encoding declaration (the pep-0263
# cookie looked for by detect_encoding) and captures the encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches, on bytes, a line that is empty or holds only whitespace and/or a
# comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 297
 298
 299 def _get_normal_name(orig_enc: str) -> str:
 300     """Imitates get_normal_name in tokenizer.c."""
 301     # Only care about the first 12 characters.
 302     enc = orig_enc[:12].lower().replace("_", "-")
 303     if enc == "utf-8" or enc.startswith("utf-8-"):
 304         return "utf-8"
 305     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 306         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 307     ):
 308         return "iso-8859-1"
 309     return orig_enc
 310
 311
 312 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 313     """
 314     The detect_encoding() function is used to detect the encoding that should
 315     be used to decode a Python source file. It requires one argument, readline,
 316     in the same way as the tokenize() generator.
 317
 318     It will call readline a maximum of twice, and return the encoding used
 319     (as a string) and a list of any lines (left as bytes) it has read
 320     in.
 321
 322     It detects the encoding from the presence of a utf-8 bom or an encoding
 323     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 324     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 325     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 326     'utf-8-sig' is returned.
 327
 328     If no encoding is specified, then the default of 'utf-8' will be returned.
 329     """
 330     bom_found = False
 331     encoding = None
 332     default = "utf-8"
 333
 334     def read_or_stop() -> bytes:
 335         try:
 336             return readline()
 337         except StopIteration:
 338             return bytes()
 339
 340     def find_cookie(line: bytes) -> Optional[str]:
 341         try:
 342             line_string = line.decode("ascii")
 343         except UnicodeDecodeError:
 344             return None
 345         match = cookie_re.match(line_string)
 346         if not match:
 347             return None
 348         encoding = _get_normal_name(match.group(1))
 349         try:
 350             codec = lookup(encoding)
 351         except LookupError:
 352             # This behaviour mimics the Python interpreter
 353             raise SyntaxError("unknown encoding: " + encoding)
 354
 355         if bom_found:
 356             if codec.name != "utf-8":
 357                 # This behaviour mimics the Python interpreter
 358                 raise SyntaxError("encoding problem: utf-8")
 359             encoding += "-sig"
 360         return encoding
 361
 362     first = read_or_stop()
 363     if first.startswith(BOM_UTF8):
 364         bom_found = True
 365         first = first[3:]
 366         default = "utf-8-sig"
 367     if not first:
 368         return default, []
 369
 370     encoding = find_cookie(first)
 371     if encoding:
 372         return encoding, [first]
 373     if not blank_re.match(first):
 374         return default, [first]
 375
 376     second = read_or_stop()
 377     if not second:
 378         return default, [first]
 379
 380     encoding = find_cookie(second)
 381     if encoding:
 382         return encoding, [first, second]
 383
 384     return default, [first, second]
 385
 386
 387 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 388     """Transform tokens back into Python source code.
 389
 390     Each element returned by the iterable must be a token sequence
 391     with at least two elements, a token number and token value.  If
 392     only two tokens are passed, the resulting output is poor.
 393
 394     Round-trip invariant for full input:
 395         Untokenized source will match input source exactly
 396
 397     Round-trip invariant for limited input:
 398         # Output text will tokenize the back to the input
 399         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 400         newcode = untokenize(t1)
 401         readline = iter(newcode.splitlines(1)).next
 402         t2 = [tok[:2] for tokin generate_tokens(readline)]
 403         assert t1 == t2
 404     """
 405     ut = Untokenizer()
 406     return ut.untokenize(iterable)
 407
 408
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable that signals the end of input by raising
    StopIteration, which is treated like an empty line.

    The optional grammar argument supplies the async_keywords flag: when it
    is true, `async` and `await` are always emitted as ASYNC/AWAIT tokens.
    Without a grammar they are only emitted that way inside an `async def`
    body, or for `async` when it is directly followed by `def` or `for`.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. For a string that spans
    several lines, the line member holds all of those lines joined together.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: number of the line being scanned; parenlev: bracket nesting depth;
    # continued: set after a backslash line continuation.
    lnum = parenlev = continued = 0
    numchars: Final = "0123456789"
    # contstr: text collected so far for a string that continues on later
    # lines; needcont: set for single-quoted strings, whose lines must each
    # end with a backslash to continue.
    contstr, needcont = "", 0
    # The source lines seen so far for the string held in contstr.
    contline: Optional[str] = None
    # Stack of the indentation columns that are currently open.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds an `async` NAME token back until the following token
    # shows whether it has to be re-emitted as ASYNC.
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and end pattern of the string currently being continued.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; scanning resumes after it.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string was neither closed nor continued
                # with a trailing backslash: report what was collected.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the column count.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # The line held nothing but spaces, tabs and form feeds (not
                # even a newline): stop reading input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async function body.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same reset for a statement that starts at the `async def` level
            # after the def's own NEWLINE has been seen.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break does not end the statement.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # Entries for bare prefixes are None, so this falls
                        # through to the entry for the quote character.
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    # Hold `async` back until the next token is known.
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed NAME token as ASYNC.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and punctuation; track bracket nesting.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: report the single character and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 680
 681
 682 if __name__ == "__main__":  # testing
 683     import sys
 684
 685     if len(sys.argv) > 1:
 686         tokenize(open(sys.argv[1]).readline)
 687     else:
 688         tokenize(sys.stdin.readline)