blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 from typing import (
  31     Callable,
  32     Iterable,
  33     Iterator,
  34     List,
  35     Optional,
  36     Text,
  37     Tuple,
  38     Pattern,
  39     Union,
  40     cast,
  41 )
  42 from blib2to3.pgen2.token import *
  43 from blib2to3.pgen2.grammar import Grammar
  44
  45 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  46 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  47
  48 import regex as re
  49 from codecs import BOM_UTF8, lookup
  50 from blib2to3.pgen2.token import *
  51
  52 from . import token
  53
  54 __all__ = [x for x in dir(token) if x[0] != "_"] + [
  55     "tokenize",
  56     "generate_tokens",
  57     "untokenize",
  58 ]
  59 del token
  60
  61
  62 def group(*choices):
  63     return "(" + "|".join(choices) + ")"
  64
  65
  66 def any(*choices):
  67     return group(*choices) + "*"
  68
  69
  70 def maybe(*choices):
  71     return group(*choices) + "?"
  72
  73
  74 def _combinations(*l):
  75     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  76
  77
# Optional run of horizontal whitespace: space, form feed or tab.
Whitespace = r"[ \f\t]*"
# A comment: "#" followed by everything up to (not including) the line ending.
Comment = r"#[^\r\n]*"
# Skippable material: whitespace, any number of backslash-newline
# continuations (each followed by whitespace), then an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = r"\w+"  # this is invalid but it's fine because Name comes after Number in all groups

# Integer literals.  Digit groups may be separated by single underscores, and
# the hex, octal and decimal forms accept an optional trailing "l"/"L".
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o"/"O" marker is optional here, so a bare leading zero followed by
# octal digits (e.g. 0777) is matched as well.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either contain a "." (with an optional exponent)
# or consist of digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by "j"/"J".
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are ordered Imagnumber, Floatnumber, Intnumber.
Number = group(Imagnumber, Floatnumber, Intnumber)
  96
# The four "tail end" patterns below match the remainder of a string whose
# opening quote has already been consumed, up to and including the closing
# quote(s).  A backslash always escapes the character that follows it.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single letter out of u/r/b/f (either case), or a
# two-letter combination of r with f or b, or of f/b/u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quote characters).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 112
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# A character class matching any single bracket character.
Bracket = "[][(){}]"
# A line ending, or one of the remaining punctuation characters.
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# A complete token that fits on one line, optionally preceded by ignorable
# material (whitespace, line continuations, a comment).
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# The string either closes on this line or the line ends in a backslash
# followed by a newline.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Items handled specially by generate_tokens(): a backslash-newline
# continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one token-like item; the item itself is
# capture group 1, which is what generate_tokens() reads via span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled forms of the patterns above.
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 147
 148 _strprefixes = (
 149     _combinations("r", "R", "f", "F")
 150     | _combinations("r", "R", "b", "B")
 151     | {"u", "U", "ur", "uR", "Ur", "UR"}
 152 )
 153
 154 endprogs = {
 155     "'": re.compile(Single),
 156     '"': re.compile(Double),
 157     "'''": single3prog,
 158     '"""': double3prog,
 159     **{f"{prefix}'''": single3prog for prefix in _strprefixes},
 160     **{f'{prefix}"""': double3prog for prefix in _strprefixes},
 161     **{prefix: None for prefix in _strprefixes},
 162 }
 163
 164 triple_quoted = (
 165     {"'''", '"""'}
 166     | {f"{prefix}'''" for prefix in _strprefixes}
 167     | {f'{prefix}"""' for prefix in _strprefixes}
 168 )
 169 single_quoted = (
 170     {"'", '"'}
 171     | {f"{prefix}'" for prefix in _strprefixes}
 172     | {f'{prefix}"' for prefix in _strprefixes}
 173 )
 174
 175 tabsize = 8
 176
 177
 178 class TokenError(Exception):
 179     pass
 180
 181
 182 class StopTokenizing(Exception):
 183     pass
 184
 185
 186 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 187     (srow, scol) = xxx_todo_changeme
 188     (erow, ecol) = xxx_todo_changeme1
 189     print(
 190         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 191     )
 192
 193
# A (row, column) position within the source.
Coord = Tuple[int, int]
# Signature of a tokeneater callback: it receives the five token fields
# (type, string, start position, end position, source line) as arguments.
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
 196
 197
 198 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
 199     """
 200     The tokenize() function accepts two parameters: one representing the
 201     input stream, and one providing an output mechanism for tokenize().
 202
 203     The first parameter, readline, must be a callable object which provides
 204     the same interface as the readline() method of built-in file objects.
 205     Each call to the function should return one line of input as a string.
 206
 207     The second parameter, tokeneater, must also be a callable object. It is
 208     called once for each token, with five arguments, corresponding to the
 209     tuples generated by generate_tokens().
 210     """
 211     try:
 212         tokenize_loop(readline, tokeneater)
 213     except StopTokenizing:
 214         pass
 215
 216
 217 # backwards compatible interface
 218 def tokenize_loop(readline, tokeneater):
 219     for token_info in generate_tokens(readline):
 220         tokeneater(*token_info)
 221
 222
# A full token: (type, string, start position, end position, source line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# What untokenize() accepts: either a bare (type, string) pair or a full token.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 225
 226
 227 class Untokenizer:
 228
 229     tokens: List[Text]
 230     prev_row: int
 231     prev_col: int
 232
 233     def __init__(self) -> None:
 234         self.tokens = []
 235         self.prev_row = 1
 236         self.prev_col = 0
 237
 238     def add_whitespace(self, start: Coord) -> None:
 239         row, col = start
 240         assert row <= self.prev_row
 241         col_offset = col - self.prev_col
 242         if col_offset:
 243             self.tokens.append(" " * col_offset)
 244
 245     def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
 246         for t in iterable:
 247             if len(t) == 2:
 248                 self.compat(cast(Tuple[int, str], t), iterable)
 249                 break
 250             tok_type, token, start, end, line = cast(
 251                 Tuple[int, Text, Coord, Coord, Text], t
 252             )
 253             self.add_whitespace(start)
 254             self.tokens.append(token)
 255             self.prev_row, self.prev_col = end
 256             if tok_type in (NEWLINE, NL):
 257                 self.prev_row += 1
 258                 self.prev_col = 0
 259         return "".join(self.tokens)
 260
 261     def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
 262         startline = False
 263         indents = []
 264         toks_append = self.tokens.append
 265         toknum, tokval = token
 266         if toknum in (NAME, NUMBER):
 267             tokval += " "
 268         if toknum in (NEWLINE, NL):
 269             startline = True
 270         for tok in iterable:
 271             toknum, tokval = tok[:2]
 272
 273             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 274                 tokval += " "
 275
 276             if toknum == INDENT:
 277                 indents.append(tokval)
 278                 continue
 279             elif toknum == DEDENT:
 280                 indents.pop()
 281                 continue
 282             elif toknum in (NEWLINE, NL):
 283                 startline = True
 284             elif startline and indents:
 285                 toks_append(indents[-1])
 286                 startline = False
 287             toks_append(tokval)
 288
 289
# Matches an encoding declaration inside a comment line ("# ... coding: NAME"
# or "# ... coding=NAME"); group 1 captures the encoding name.  This is a str
# pattern: detect_encoding() decodes the line before matching.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a raw (bytes) line that is empty, whitespace-only or a comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 292
 293
 294 def _get_normal_name(orig_enc: str) -> str:
 295     """Imitates get_normal_name in tokenizer.c."""
 296     # Only care about the first 12 characters.
 297     enc = orig_enc[:12].lower().replace("_", "-")
 298     if enc == "utf-8" or enc.startswith("utf-8-"):
 299         return "utf-8"
 300     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 301         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 302     ):
 303         return "iso-8859-1"
 304     return orig_enc
 305
 306
 307 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 308     """
 309     The detect_encoding() function is used to detect the encoding that should
 310     be used to decode a Python source file. It requires one argument, readline,
 311     in the same way as the tokenize() generator.
 312
 313     It will call readline a maximum of twice, and return the encoding used
 314     (as a string) and a list of any lines (left as bytes) it has read
 315     in.
 316
 317     It detects the encoding from the presence of a utf-8 bom or an encoding
 318     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 319     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 320     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 321     'utf-8-sig' is returned.
 322
 323     If no encoding is specified, then the default of 'utf-8' will be returned.
 324     """
 325     bom_found = False
 326     encoding = None
 327     default = "utf-8"
 328
 329     def read_or_stop() -> bytes:
 330         try:
 331             return readline()
 332         except StopIteration:
 333             return bytes()
 334
 335     def find_cookie(line: bytes) -> Optional[str]:
 336         try:
 337             line_string = line.decode("ascii")
 338         except UnicodeDecodeError:
 339             return None
 340         match = cookie_re.match(line_string)
 341         if not match:
 342             return None
 343         encoding = _get_normal_name(match.group(1))
 344         try:
 345             codec = lookup(encoding)
 346         except LookupError:
 347             # This behaviour mimics the Python interpreter
 348             raise SyntaxError("unknown encoding: " + encoding)
 349
 350         if bom_found:
 351             if codec.name != "utf-8":
 352                 # This behaviour mimics the Python interpreter
 353                 raise SyntaxError("encoding problem: utf-8")
 354             encoding += "-sig"
 355         return encoding
 356
 357     first = read_or_stop()
 358     if first.startswith(BOM_UTF8):
 359         bom_found = True
 360         first = first[3:]
 361         default = "utf-8-sig"
 362     if not first:
 363         return default, []
 364
 365     encoding = find_cookie(first)
 366     if encoding:
 367         return encoding, [first]
 368     if not blank_re.match(first):
 369         return default, [first]
 370
 371     second = read_or_stop()
 372     if not second:
 373         return default, [first]
 374
 375     encoding = find_cookie(second)
 376     if encoding:
 377         return encoding, [first, second]
 378
 379     return default, [first, second]
 380
 381
 382 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 383     """Transform tokens back into Python source code.
 384
 385     Each element returned by the iterable must be a token sequence
 386     with at least two elements, a token number and token value.  If
 387     only two tokens are passed, the resulting output is poor.
 388
 389     Round-trip invariant for full input:
 390         Untokenized source will match input source exactly
 391
 392     Round-trip invariant for limited input:
 393         # Output text will tokenize the back to the input
 394         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 395         newcode = untokenize(t1)
 396         readline = iter(newcode.splitlines(1)).next
 397         t2 = [tok[:2] for tokin generate_tokens(readline)]
 398         assert t1 == t2
 399     """
 400     ut = Untokenizer()
 401     return ut.untokenize(iterable)
 402
 403
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = iter(open(myfile)).__next__    # Example of alternate readline

    The optional grammar argument is consulted only for its async_keywords
    attribute: when that is true, `async` and `await` are always produced as
    ASYNC/AWAIT tokens; otherwise (or when grammar is None) they are
    recognized from context.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a line dedents to a
    column that matches no enclosing indentation level.
    """
    # lnum: number of the line being scanned (the first line is 1);
    # parenlev: current bracket nesting depth;
    # continued: set to 1 after a backslash continuation.
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    # contstr: text collected so far of a string spanning several lines;
    # needcont: 1 while a single-quoted string awaits its continuation line.
    contstr, needcont = "", 0
    # Source lines collected so far for the string held in contstr.
    contline: Optional[str] = None
    # Stack of indentation columns of the enclosing blocks.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds back a NAME token "async" until the following token shows
    # whether it has to be emitted as ASYNC or as an ordinary NAME.
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and end pattern of the multi-line string in progress.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Exhaustion is handled like an empty line, i.e. end of input.
            line = ""
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; the rest of the line is
                # scanned for further tokens below.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string neither closes on this line nor
                # continues with a trailing backslash: report it as an error.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column = column + 1
                elif line[pos] == "\t":
                    # Advance to the next multiple of tabsize.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the column count.
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                # The line held nothing but spaces, tabs and form feeds (not
                # even a line ending): stop reading input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # A comment-only line yields COMMENT followed by NL and does
                # not take part in the indentation bookkeeping below.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level at which the async
                # def was seen ends the async-def state.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same reset for a new statement at that level once a NEWLINE
            # has been seen since the async def.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Line ending: NL inside brackets, NEWLINE otherwise.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opening quotes were matched; look for the end
                    # of the string on the remainder of the line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # endprogs maps bare prefix letters to None, so this
                        # chain skips the prefix and picks the pattern
                        # registered for the quote character.
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold "async" back until the next token is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Emit the stashed "async" as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Everything else is an operator; track bracket nesting.
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: emit the single character as an
                # error token and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 675
 676
 677 if __name__ == "__main__":  # testing
 678     import sys
 679
 680     if len(sys.argv) > 1:
 681         tokenize(open(sys.argv[1]).readline)
 682     else:
 683         tokenize(sys.stdin.readline)