blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 """Tokenization help for Python programs.
   5
   6 generate_tokens(readline) is a generator that breaks a stream of
   7 text into Python tokens.  It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or "" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
  17 It is designed to match the working of the Python tokenizer exactly, except
  18 that it produces COMMENT tokens for comments and gives type OP for all
  19 operators
  20
  21 Older entry points
  22     tokenize_loop(readline, tokeneater)
  23     tokenize(readline, tokeneater=printtoken)
  24 are the same, except instead of generating tokens, tokeneater is a callback
  25 function to which the 5 fields described above are passed as 5 arguments,
  26 each time a new token is found."""
  27
  28 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  29 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  30
  31 import regex as re
  32 from codecs import BOM_UTF8, lookup
  33 from blib2to3.pgen2.token import *
  34
  35 from . import token
  36
# Public names of this module: every name in the token module that does not
# start with an underscore, plus the tokenizer entry points defined here.
__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
# The module object was only needed to compute __all__; remove the name from
# this module's namespace again.
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str
  50
  51
  52 def group(*choices):
  53     return "(" + "|".join(choices) + ")"
  54
  55
  56 def any(*choices):
  57     return group(*choices) + "*"
  58
  59
  60 def maybe(*choices):
  61     return group(*choices) + "?"
  62
  63
  64 def _combinations(*l):
  65     return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
  66
  67
# Regex source fragments (plain strings).  They are combined further down
# into the Token and PseudoToken patterns that get compiled.
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
# Leading whitespace, any number of backslash-newline continuations (each
# followed by whitespace), and an optional trailing comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = r"\w+"  # this is invalid but it's fine because Name comes after Number in all groups

# Integer literals.  Digit groups may be separated by single underscores;
# a trailing l/L is accepted on hex, octal and decimal forms
# (presumably for Python 2 long literals -- confirm before relying on it).
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o"/"O" marker is optional, so a bare leading zero followed by octal
# digits also matches.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either a decimal point (with optional exponent)
# or digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are tried left to right, so the longer forms come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
  86
# The four "tail end" patterns match the remainder of a string literal, up to
# and including its closing quote(s), allowing backslash escapes.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single letter from u/r/b/f (either case), or a
# two-letter combination of r with f/b, or of f/b/u with r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quotes only).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 102
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
# Newlines and the remaining single-character punctuation.
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# A complete token: Number is tried before Name (see the note on Name).
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one capturing group (group 1) that holds the
# token text itself; generate_tokens() reads the token span from that group.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled forms.  `re` here is whatever the module imported under that name.
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 135 single3prog = re.compile(Single3)
 136 double3prog = re.compile(Double3)
 137
# Every accepted string prefix spelling: r/f and r/b combinations in any
# case and order (built by _combinations), plus the u / ur spellings.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps the opening of a string literal to the compiled pattern that matches
# the rest of it.  Bare prefixes map to None so that generate_tokens() can
# chain lookups with `or` until it reaches the actual quote character.
endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

# All possible openings of triple-quoted strings (with and without prefix).
triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All possible openings of single-quoted strings (with and without prefix).
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

# Tab stop width used by generate_tokens() when measuring indentation.
tabsize = 8
 166
 167
 168 class TokenError(Exception):
 169     pass
 170
 171
 172 class StopTokenizing(Exception):
 173     pass
 174
 175
 176 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
 177     (srow, scol) = xxx_todo_changeme
 178     (erow, ecol) = xxx_todo_changeme1
 179     print(
 180         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 181     )
 182
 183
 184 def tokenize(readline, tokeneater=printtoken):
 185     """
 186     The tokenize() function accepts two parameters: one representing the
 187     input stream, and one providing an output mechanism for tokenize().
 188
 189     The first parameter, readline, must be a callable object which provides
 190     the same interface as the readline() method of built-in file objects.
 191     Each call to the function should return one line of input as a string.
 192
 193     The second parameter, tokeneater, must also be a callable object. It is
 194     called once for each token, with five arguments, corresponding to the
 195     tuples generated by generate_tokens().
 196     """
 197     try:
 198         tokenize_loop(readline, tokeneater)
 199     except StopTokenizing:
 200         pass
 201
 202
 203 # backwards compatible interface
 204 def tokenize_loop(readline, tokeneater):
 205     for token_info in generate_tokens(readline):
 206         tokeneater(*token_info)
 207
 208
 209 class Untokenizer:
 210     def __init__(self):
 211         self.tokens = []
 212         self.prev_row = 1
 213         self.prev_col = 0
 214
 215     def add_whitespace(self, start):
 216         row, col = start
 217         assert row <= self.prev_row
 218         col_offset = col - self.prev_col
 219         if col_offset:
 220             self.tokens.append(" " * col_offset)
 221
 222     def untokenize(self, iterable):
 223         for t in iterable:
 224             if len(t) == 2:
 225                 self.compat(t, iterable)
 226                 break
 227             tok_type, token, start, end, line = t
 228             self.add_whitespace(start)
 229             self.tokens.append(token)
 230             self.prev_row, self.prev_col = end
 231             if tok_type in (NEWLINE, NL):
 232                 self.prev_row += 1
 233                 self.prev_col = 0
 234         return "".join(self.tokens)
 235
 236     def compat(self, token, iterable):
 237         startline = False
 238         indents = []
 239         toks_append = self.tokens.append
 240         toknum, tokval = token
 241         if toknum in (NAME, NUMBER):
 242             tokval += " "
 243         if toknum in (NEWLINE, NL):
 244             startline = True
 245         for tok in iterable:
 246             toknum, tokval = tok[:2]
 247
 248             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 249                 tokval += " "
 250
 251             if toknum == INDENT:
 252                 indents.append(tokval)
 253                 continue
 254             elif toknum == DEDENT:
 255                 indents.pop()
 256                 continue
 257             elif toknum in (NEWLINE, NL):
 258                 startline = True
 259             elif startline and indents:
 260                 toks_append(indents[-1])
 261                 startline = False
 262             toks_append(tokval)
 263
 264
# Matches a comment line containing an encoding declaration of the form
# "coding:" or "coding=" (see PEP 263); group 1 captures the encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Bytes pattern: optional leading whitespace followed by "#", a CR/LF, or the
# end of the input, i.e. a line that is blank or holds only a comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 267
 268
 269 def _get_normal_name(orig_enc):
 270     """Imitates get_normal_name in tokenizer.c."""
 271     # Only care about the first 12 characters.
 272     enc = orig_enc[:12].lower().replace("_", "-")
 273     if enc == "utf-8" or enc.startswith("utf-8-"):
 274         return "utf-8"
 275     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 276         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 277     ):
 278         return "iso-8859-1"
 279     return orig_enc
 280
 281
 282 def detect_encoding(readline):
 283     """
 284     The detect_encoding() function is used to detect the encoding that should
 285     be used to decode a Python source file. It requires one argument, readline,
 286     in the same way as the tokenize() generator.
 287
 288     It will call readline a maximum of twice, and return the encoding used
 289     (as a string) and a list of any lines (left as bytes) it has read
 290     in.
 291
 292     It detects the encoding from the presence of a utf-8 bom or an encoding
 293     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 294     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 295     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 296     'utf-8-sig' is returned.
 297
 298     If no encoding is specified, then the default of 'utf-8' will be returned.
 299     """
 300     bom_found = False
 301     encoding = None
 302     default = "utf-8"
 303
 304     def read_or_stop():
 305         try:
 306             return readline()
 307         except StopIteration:
 308             return bytes()
 309
 310     def find_cookie(line):
 311         try:
 312             line_string = line.decode("ascii")
 313         except UnicodeDecodeError:
 314             return None
 315         match = cookie_re.match(line_string)
 316         if not match:
 317             return None
 318         encoding = _get_normal_name(match.group(1))
 319         try:
 320             codec = lookup(encoding)
 321         except LookupError:
 322             # This behaviour mimics the Python interpreter
 323             raise SyntaxError("unknown encoding: " + encoding)
 324
 325         if bom_found:
 326             if codec.name != "utf-8":
 327                 # This behaviour mimics the Python interpreter
 328                 raise SyntaxError("encoding problem: utf-8")
 329             encoding += "-sig"
 330         return encoding
 331
 332     first = read_or_stop()
 333     if first.startswith(BOM_UTF8):
 334         bom_found = True
 335         first = first[3:]
 336         default = "utf-8-sig"
 337     if not first:
 338         return default, []
 339
 340     encoding = find_cookie(first)
 341     if encoding:
 342         return encoding, [first]
 343     if not blank_re.match(first):
 344         return default, [first]
 345
 346     second = read_or_stop()
 347     if not second:
 348         return default, [first]
 349
 350     encoding = find_cookie(second)
 351     if encoding:
 352         return encoding, [first, second]
 353
 354     return default, [first, second]
 355
 356
 357 def untokenize(iterable):
 358     """Transform tokens back into Python source code.
 359
 360     Each element returned by the iterable must be a token sequence
 361     with at least two elements, a token number and token value.  If
 362     only two tokens are passed, the resulting output is poor.
 363
 364     Round-trip invariant for full input:
 365         Untokenized source will match input source exactly
 366
 367     Round-trip invariant for limited intput:
 368         # Output text will tokenize the back to the input
 369         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 370         newcode = untokenize(t1)
 371         readline = iter(newcode.splitlines(1)).next
 372         t2 = [tok[:2] for tokin generate_tokens(readline)]
 373         assert t1 == t2
 374     """
 375     ut = Untokenizer()
 376     return ut.untokenize(iterable)
 377
 378
def generate_tokens(readline, grammar=None):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The optional grammar argument, when given, must provide an
    ``async_keywords`` attribute.  If that attribute is true, ``async`` and
    ``await`` are always produced as ASYNC / AWAIT tokens; otherwise they are
    only treated that way in the contexts detected below (``async def`` /
    ``async for`` and the body of an ``async def``).

    Raises TokenError if the input ends inside a multi-line string or a
    continued statement, and IndentationError if a dedent does not return to
    a previously seen indentation column.
    """
    # lnum: number of the line being processed (1-based once reading starts).
    # parenlev: current nesting depth of (), [] and {}.
    # continued: set after a backslash continuation so the next line is not
    #            treated as the start of a new statement.
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    # contstr holds the text of a string literal that spans lines; needcont is
    # set for single-quoted strings, which must end each line with a backslash.
    contstr, needcont = "", 0
    contline = None
    # Stack of indentation columns currently open; column 0 is always present.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; emit it as one STRING token
                # and keep scanning the rest of the line below.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string was neither closed nor continued
                # with a backslash: report what was collected as an error.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column = column + 1
                elif line[pos] == "\t":
                    # Advance to the next multiple of tabsize.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                # The line held only whitespace and no newline character.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async function context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same reset for a statement that starts at the async def's own
            # level once the def's header line has ended (async_def_nl).
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token text without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a newline does not end the statement.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # Prefix characters map to None in endprogs, so this
                        # falls through to the entry for the quote character.
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    # Hold back a NAME "async" until the following token
                    # shows whether it introduces `async def` / `async for`.
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the held-back "async" as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at this position: emit the single character
                # as an ERRORTOKEN and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 644
 645
 646 if __name__ == "__main__":  # testing
 647     import sys
 648
 649     if len(sys.argv) > 1:
 650         tokenize(open(sys.argv[1]).readline)
 651     else:
 652         tokenize(sys.stdin.readline)