blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 """Tokenization help for Python programs.
   5
   6 generate_tokens(readline) is a generator that breaks a stream of
   7 text into Python tokens.  It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or "" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
  17 It is designed to match the working of the Python tokenizer exactly, except
  18 that it produces COMMENT tokens for comments and gives type OP for all
  19 operators
  20
  21 Older entry points
  22     tokenize_loop(readline, tokeneater)
  23     tokenize(readline, tokeneater=printtoken)
  24 are the same, except instead of generating tokens, tokeneater is a callback
  25 function to which the 5 fields described above are passed as 5 arguments,
  26 each time a new token is found."""
  27
  28 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  29 __credits__ = \
  30     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  31
  32 import re
  33 from codecs import BOM_UTF8, lookup
  34 from attr import dataclass
  35 from blib2to3.pgen2.token import *
  36
  37 from . import token
  38 __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
  39            "generate_tokens", "untokenize"]
  40 del token
  41
# Compatibility shim: referencing the bare name ``bytes`` raises NameError on
# interpreters that do not define it, in which case ``bytes`` is aliased to
# ``str`` so the rest of the module can use the name unconditionally.
try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str
  48
  49 def group(*choices): return '(' + '|'.join(choices) + ')'
  50 def any(*choices): return group(*choices) + '*'
  51 def maybe(*choices): return group(*choices) + '?'
  52 def _combinations(*l):
  53     return set(
  54         x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
  55     )
  56
# Regex source fragments.  They are plain strings that get concatenated and
# wrapped by group()/any()/maybe() before being compiled further down.

# Horizontal whitespace: spaces, form feeds and tabs.
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to (not including) the end of the line.
Comment = r'#[^\r\n]*'
# Text skipped before a token: whitespace, any number of backslash-newline
# continuations (each followed by whitespace), and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups

# Integer literals.  Digits may be separated by single underscores; the hex,
# octal and decimal forms also accept an optional trailing l/L suffix.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
# The o/O marker is optional, so a bare leading zero form also matches here.
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either contain a '.' (with optional exponent) or
# are digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the imaginary and float forms are
# listed before the plain integer forms.
Number = group(Imagnumber, Floatnumber, Intnumber)
  73
# The four "tail end" patterns match the remainder of a string literal whose
# opening quote has already been consumed, up to and including the closing
# quote(s).  A backslash always escapes the following character.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single letter from u/r/b/f (either case), or a
# two-letter combination of r with f or b, or of f/b/u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quote characters).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  87
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

# Any single opening or closing bracket character.
Bracket = '[][(){}]'
# Newlines and the remaining single-character punctuation.
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# The match ends either at the closing quote or at a backslash-newline,
# the latter meaning the string continues on the next line.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Items the line scanner must recognise in addition to ordinary tokens:
# backslash continuations, comments and the opening of triple-quoted strings.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Group 1 of this pattern is the token text without its leading whitespace;
# generate_tokens() reads it via span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled programs.  pseudoprog drives generate_tokens(); single3prog and
# double3prog locate the end of triple-quoted strings.
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 115
# Every accepted string prefix, in each letter case and order produced by
# _combinations(), plus the u/U forms listed explicitly.
_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

# Maps an opening quote (optionally with prefix) to the compiled pattern that
# finds the matching end of the string.  Bare prefixes map to None so that the
# ``or`` chain in generate_tokens() falls through to the entry for the quote
# character that follows the prefix.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

# All spellings that open a triple-quoted string.
triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
# All spellings that open a single-quoted string.
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

# Tab stop width used by generate_tokens() when measuring indentation columns.
tabsize = 8
 140
@dataclass(frozen=True)
class TokenizerConfig:
    """Options that adjust how generate_tokens() tokenizes its input."""

    # When true, generate_tokens() always emits `async` and `await` as
    # ASYNC/AWAIT tokens; otherwise it only does so while inside an
    # `async def` it has recognised.
    async_is_reserved_keyword: bool = False
 144
 145 class TokenError(Exception): pass
 146
 147 class StopTokenizing(Exception): pass
 148
 149 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
 150     (srow, scol) = xxx_todo_changeme
 151     (erow, ecol) = xxx_todo_changeme1
 152     print("%d,%d-%d,%d:\t%s\t%s" % \
 153         (srow, scol, erow, ecol, tok_name[type], repr(token)))
 154
 155 def tokenize(readline, tokeneater=printtoken):
 156     """
 157     The tokenize() function accepts two parameters: one representing the
 158     input stream, and one providing an output mechanism for tokenize().
 159
 160     The first parameter, readline, must be a callable object which provides
 161     the same interface as the readline() method of built-in file objects.
 162     Each call to the function should return one line of input as a string.
 163
 164     The second parameter, tokeneater, must also be a callable object. It is
 165     called once for each token, with five arguments, corresponding to the
 166     tuples generated by generate_tokens().
 167     """
 168     try:
 169         tokenize_loop(readline, tokeneater)
 170     except StopTokenizing:
 171         pass
 172
 173 # backwards compatible interface
 174 def tokenize_loop(readline, tokeneater):
 175     for token_info in generate_tokens(readline):
 176         tokeneater(*token_info)
 177
 178 class Untokenizer:
 179
 180     def __init__(self):
 181         self.tokens = []
 182         self.prev_row = 1
 183         self.prev_col = 0
 184
 185     def add_whitespace(self, start):
 186         row, col = start
 187         assert row <= self.prev_row
 188         col_offset = col - self.prev_col
 189         if col_offset:
 190             self.tokens.append(" " * col_offset)
 191
 192     def untokenize(self, iterable):
 193         for t in iterable:
 194             if len(t) == 2:
 195                 self.compat(t, iterable)
 196                 break
 197             tok_type, token, start, end, line = t
 198             self.add_whitespace(start)
 199             self.tokens.append(token)
 200             self.prev_row, self.prev_col = end
 201             if tok_type in (NEWLINE, NL):
 202                 self.prev_row += 1
 203                 self.prev_col = 0
 204         return "".join(self.tokens)
 205
 206     def compat(self, token, iterable):
 207         startline = False
 208         indents = []
 209         toks_append = self.tokens.append
 210         toknum, tokval = token
 211         if toknum in (NAME, NUMBER):
 212             tokval += ' '
 213         if toknum in (NEWLINE, NL):
 214             startline = True
 215         for tok in iterable:
 216             toknum, tokval = tok[:2]
 217
 218             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 219                 tokval += ' '
 220
 221             if toknum == INDENT:
 222                 indents.append(tokval)
 223                 continue
 224             elif toknum == DEDENT:
 225                 indents.pop()
 226                 continue
 227             elif toknum in (NEWLINE, NL):
 228                 startline = True
 229             elif startline and indents:
 230                 toks_append(indents[-1])
 231                 startline = False
 232             toks_append(tokval)
 233
 234 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 235 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 236
 237 def _get_normal_name(orig_enc):
 238     """Imitates get_normal_name in tokenizer.c."""
 239     # Only care about the first 12 characters.
 240     enc = orig_enc[:12].lower().replace("_", "-")
 241     if enc == "utf-8" or enc.startswith("utf-8-"):
 242         return "utf-8"
 243     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
 244        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
 245         return "iso-8859-1"
 246     return orig_enc
 247
 248 def detect_encoding(readline):
 249     """
 250     The detect_encoding() function is used to detect the encoding that should
 251     be used to decode a Python source file. It requires one argument, readline,
 252     in the same way as the tokenize() generator.
 253
 254     It will call readline a maximum of twice, and return the encoding used
 255     (as a string) and a list of any lines (left as bytes) it has read
 256     in.
 257
 258     It detects the encoding from the presence of a utf-8 bom or an encoding
 259     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 260     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 261     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 262     'utf-8-sig' is returned.
 263
 264     If no encoding is specified, then the default of 'utf-8' will be returned.
 265     """
 266     bom_found = False
 267     encoding = None
 268     default = 'utf-8'
 269     def read_or_stop():
 270         try:
 271             return readline()
 272         except StopIteration:
 273             return bytes()
 274
 275     def find_cookie(line):
 276         try:
 277             line_string = line.decode('ascii')
 278         except UnicodeDecodeError:
 279             return None
 280         match = cookie_re.match(line_string)
 281         if not match:
 282             return None
 283         encoding = _get_normal_name(match.group(1))
 284         try:
 285             codec = lookup(encoding)
 286         except LookupError:
 287             # This behaviour mimics the Python interpreter
 288             raise SyntaxError("unknown encoding: " + encoding)
 289
 290         if bom_found:
 291             if codec.name != 'utf-8':
 292                 # This behaviour mimics the Python interpreter
 293                 raise SyntaxError('encoding problem: utf-8')
 294             encoding += '-sig'
 295         return encoding
 296
 297     first = read_or_stop()
 298     if first.startswith(BOM_UTF8):
 299         bom_found = True
 300         first = first[3:]
 301         default = 'utf-8-sig'
 302     if not first:
 303         return default, []
 304
 305     encoding = find_cookie(first)
 306     if encoding:
 307         return encoding, [first]
 308     if not blank_re.match(first):
 309         return default, [first]
 310
 311     second = read_or_stop()
 312     if not second:
 313         return default, [first]
 314
 315     encoding = find_cookie(second)
 316     if encoding:
 317         return encoding, [first, second]
 318
 319     return default, [first, second]
 320
 321 def untokenize(iterable):
 322     """Transform tokens back into Python source code.
 323
 324     Each element returned by the iterable must be a token sequence
 325     with at least two elements, a token number and token value.  If
 326     only two tokens are passed, the resulting output is poor.
 327
 328     Round-trip invariant for full input:
 329         Untokenized source will match input source exactly
 330
 331     Round-trip invariant for limited intput:
 332         # Output text will tokenize the back to the input
 333         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 334         newcode = untokenize(t1)
 335         readline = iter(newcode.splitlines(1)).next
 336         t2 = [tok[:2] for tokin generate_tokens(readline)]
 337         assert t1 == t2
 338     """
 339     ut = Untokenizer()
 340     return ut.untokenize(iterable)
 341
def generate_tokens(readline, config: TokenizerConfig = TokenizerConfig()):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The optional config argument is a TokenizerConfig.  When its
    async_is_reserved_keyword flag is true, `async` and `await` are always
    produced as ASYNC/AWAIT tokens; otherwise that only happens inside an
    `async def` recognised by the tokenizer.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    continued statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: current line number; parenlev: bracket nesting depth;
    # continued: set after a backslash continuation was seen.
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    # contstr: text collected so far for a string spanning several lines.
    # needcont: set for quote-delimited (non-triple) strings, whose lines
    # must each end in a backslash-newline to continue.
    contstr, needcont = '', 0
    # contline: the source lines collected for the string in contstr.
    contline = None
    # Stack of indentation columns; the bottom entry is always column 0.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_is_reserved_keyword = config.async_is_reserved_keyword
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds back an `async` NAME token until the following token shows
    # whether it introduces `async def` / `async for`.
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Treated the same as readline() returning an empty string (EOF).
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; emit it and keep scanning the
                # rest of the line from the end of the match.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # The line neither closes the string nor ends in a
                # backslash-newline: report what was collected as an error.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string; keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # Only reached when the whole line is spaces/tabs/form feeds with
            # no newline character; tokenizing stops here.
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == '#':               # skip comments
                # Comment-only line: emit the comment and its line ending as
                # separate COMMENT and NL tokens, without touching indents.
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                        (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                        (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async-function state.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            # Same reset for an `async def` whose NEWLINE has been seen and
            # whose body did not increase the indentation level.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break is NL rather than NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_is_reserved_keyword or async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        # Hold the token back until the next one is known.
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed `async` NAME as ASYNC.
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and brackets; track the bracket nesting depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at this position: emit the single character
                # as an ERRORTOKEN and move on.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    # End of input: flush a pending `async` token, close all open
    # indentation levels and finish with ENDMARKER.
    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 572
 573 if __name__ == '__main__':                     # testing
 574     import sys
 575     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 576     else: tokenize(sys.stdin.readline)