blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 """Tokenization help for Python programs.
   5
   6 generate_tokens(readline) is a generator that breaks a stream of
   7 text into Python tokens.  It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or "" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
  17 It is designed to match the working of the Python tokenizer exactly, except
  18 that it produces COMMENT tokens for comments and gives type OP for all
  19 operators
  20
  21 Older entry points
  22     tokenize_loop(readline, tokeneater)
  23     tokenize(readline, tokeneater=printtoken)
  24 are the same, except instead of generating tokens, tokeneater is a callback
  25 function to which the 5 fields described above are passed as 5 arguments,
  26 each time a new token is found."""
  27
  28 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  29 __credits__ = \
  30     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  31
  32 import string, re
  33 from codecs import BOM_UTF8, lookup
  34 from lib2to3.pgen2.token import *
  35
  36 from . import token
  37 __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
  38            "generate_tokens", "untokenize"]
  39 del token
  40
  41 try:
  42     bytes
  43 except NameError:
  44     # Support bytes type in Python <= 2.5, so 2to3 turns itself into
  45     # valid Python 3 code.
  46     bytes = str
  47
  48 def group(*choices): return '(' + '|'.join(choices) + ')'
  49 def any(*choices): return group(*choices) + '*'
  50 def maybe(*choices): return group(*choices) + '?'
  51
  52 Whitespace = r'[ \f\t]*'
  53 Comment = r'#[^\r\n]*'
  54 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
  55 Name = r'[a-zA-Z_]\w*'
  56
  57 Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
  58 Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
  59 Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
  60 Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
  61 Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
  62 Exponent = r'[eE][-+]?\d+(?:_\d+)*'
  63 Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
  64 Expfloat = r'\d+(?:_\d+)*' + Exponent
  65 Floatnumber = group(Pointfloat, Expfloat)
  66 Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
  67 Number = group(Imagnumber, Floatnumber, Intnumber)
  68
  69 # Tail end of ' string.
  70 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
  71 # Tail end of " string.
  72 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  73 # Tail end of ''' string.
  74 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  75 # Tail end of """ string.
  76 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  77 _litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
  78 Triple = group(_litprefix + "'''", _litprefix + '"""')
  79 # Single-line ' or " string.
  80 String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
  81                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  82
  83 # Because of leftmost-then-longest match semantics, be sure to put the
  84 # longest operators first (e.g., if = came before ==, == would get
  85 # recognized as two instances of =).
  86 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
  87                  r"//=?", r"->",
  88                  r"[+\-*/%&@|^=<>]=?",
  89                  r"~")
  90
  91 Bracket = '[][(){}]'
  92 Special = group(r'\r?\n', r'[:;.,`@]')
  93 Funny = group(Operator, Bracket, Special)
  94
  95 PlainToken = group(Number, Funny, String, Name)
  96 Token = Ignore + PlainToken
  97
  98 # First (or only) line of ' or " string.
  99 ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
 100                 group("'", r'\\\r?\n'),
 101                 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
 102                 group('"', r'\\\r?\n'))
 103 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 104 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 105
 106 tokenprog, pseudoprog, single3prog, double3prog = list(map(
 107     re.compile, (Token, PseudoToken, Single3, Double3)))
 108 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
 109             "'''": single3prog, '"""': double3prog,
 110             "r'''": single3prog, 'r"""': double3prog,
 111             "u'''": single3prog, 'u"""': double3prog,
 112             "b'''": single3prog, 'b"""': double3prog,
 113             "f'''": single3prog, 'f"""': double3prog,
 114             "ur'''": single3prog, 'ur"""': double3prog,
 115             "br'''": single3prog, 'br"""': double3prog,
 116             "rb'''": single3prog, 'rb"""': double3prog,
 117             "R'''": single3prog, 'R"""': double3prog,
 118             "U'''": single3prog, 'U"""': double3prog,
 119             "B'''": single3prog, 'B"""': double3prog,
 120             "F'''": single3prog, 'F"""': double3prog,
 121             "uR'''": single3prog, 'uR"""': double3prog,
 122             "Ur'''": single3prog, 'Ur"""': double3prog,
 123             "UR'''": single3prog, 'UR"""': double3prog,
 124             "bR'''": single3prog, 'bR"""': double3prog,
 125             "Br'''": single3prog, 'Br"""': double3prog,
 126             "BR'''": single3prog, 'BR"""': double3prog,
 127             "rB'''": single3prog, 'rB"""': double3prog,
 128             "Rb'''": single3prog, 'Rb"""': double3prog,
 129             "RB'''": single3prog, 'RB"""': double3prog,
 130             'r': None, 'R': None,
 131             'u': None, 'U': None,
 132             'f': None, 'F': None,
 133             'b': None, 'B': None}
 134
 135 triple_quoted = {}
 136 for t in ("'''", '"""',
 137           "r'''", 'r"""', "R'''", 'R"""',
 138           "u'''", 'u"""', "U'''", 'U"""',
 139           "b'''", 'b"""', "B'''", 'B"""',
 140           "f'''", 'f"""', "F'''", 'F"""',
 141           "ur'''", 'ur"""', "Ur'''", 'Ur"""',
 142           "uR'''", 'uR"""', "UR'''", 'UR"""',
 143           "br'''", 'br"""', "Br'''", 'Br"""',
 144           "bR'''", 'bR"""', "BR'''", 'BR"""',
 145           "rb'''", 'rb"""', "Rb'''", 'Rb"""',
 146           "rB'''", 'rB"""', "RB'''", 'RB"""',):
 147     triple_quoted[t] = t
 148 single_quoted = {}
 149 for t in ("'", '"',
 150           "r'", 'r"', "R'", 'R"',
 151           "u'", 'u"', "U'", 'U"',
 152           "b'", 'b"', "B'", 'B"',
 153           "f'", 'f"', "F'", 'F"',
 154           "ur'", 'ur"', "Ur'", 'Ur"',
 155           "uR'", 'uR"', "UR'", 'UR"',
 156           "br'", 'br"', "Br'", 'Br"',
 157           "bR'", 'bR"', "BR'", 'BR"',
 158           "rb'", 'rb"', "Rb'", 'Rb"',
 159           "rB'", 'rB"', "RB'", 'RB"',):
 160     single_quoted[t] = t
 161
 162 tabsize = 8
 163
 164 class TokenError(Exception): pass
 165
 166 class StopTokenizing(Exception): pass
 167
 168 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
 169     (srow, scol) = xxx_todo_changeme
 170     (erow, ecol) = xxx_todo_changeme1
 171     print("%d,%d-%d,%d:\t%s\t%s" % \
 172         (srow, scol, erow, ecol, tok_name[type], repr(token)))
 173
 174 def tokenize(readline, tokeneater=printtoken):
 175     """
 176     The tokenize() function accepts two parameters: one representing the
 177     input stream, and one providing an output mechanism for tokenize().
 178
 179     The first parameter, readline, must be a callable object which provides
 180     the same interface as the readline() method of built-in file objects.
 181     Each call to the function should return one line of input as a string.
 182
 183     The second parameter, tokeneater, must also be a callable object. It is
 184     called once for each token, with five arguments, corresponding to the
 185     tuples generated by generate_tokens().
 186     """
 187     try:
 188         tokenize_loop(readline, tokeneater)
 189     except StopTokenizing:
 190         pass
 191
 192 # backwards compatible interface
 193 def tokenize_loop(readline, tokeneater):
 194     for token_info in generate_tokens(readline):
 195         tokeneater(*token_info)
 196
 197 class Untokenizer:
 198
 199     def __init__(self):
 200         self.tokens = []
 201         self.prev_row = 1
 202         self.prev_col = 0
 203
 204     def add_whitespace(self, start):
 205         row, col = start
 206         assert row <= self.prev_row
 207         col_offset = col - self.prev_col
 208         if col_offset:
 209             self.tokens.append(" " * col_offset)
 210
 211     def untokenize(self, iterable):
 212         for t in iterable:
 213             if len(t) == 2:
 214                 self.compat(t, iterable)
 215                 break
 216             tok_type, token, start, end, line = t
 217             self.add_whitespace(start)
 218             self.tokens.append(token)
 219             self.prev_row, self.prev_col = end
 220             if tok_type in (NEWLINE, NL):
 221                 self.prev_row += 1
 222                 self.prev_col = 0
 223         return "".join(self.tokens)
 224
 225     def compat(self, token, iterable):
 226         startline = False
 227         indents = []
 228         toks_append = self.tokens.append
 229         toknum, tokval = token
 230         if toknum in (NAME, NUMBER):
 231             tokval += ' '
 232         if toknum in (NEWLINE, NL):
 233             startline = True
 234         for tok in iterable:
 235             toknum, tokval = tok[:2]
 236
 237             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 238                 tokval += ' '
 239
 240             if toknum == INDENT:
 241                 indents.append(tokval)
 242                 continue
 243             elif toknum == DEDENT:
 244                 indents.pop()
 245                 continue
 246             elif toknum in (NEWLINE, NL):
 247                 startline = True
 248             elif startline and indents:
 249                 toks_append(indents[-1])
 250                 startline = False
 251             toks_append(tokval)
 252
 253 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 254 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 255
 256 def _get_normal_name(orig_enc):
 257     """Imitates get_normal_name in tokenizer.c."""
 258     # Only care about the first 12 characters.
 259     enc = orig_enc[:12].lower().replace("_", "-")
 260     if enc == "utf-8" or enc.startswith("utf-8-"):
 261         return "utf-8"
 262     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
 263        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
 264         return "iso-8859-1"
 265     return orig_enc
 266
 267 def detect_encoding(readline):
 268     """
 269     The detect_encoding() function is used to detect the encoding that should
 270     be used to decode a Python source file. It requires one argument, readline,
 271     in the same way as the tokenize() generator.
 272
 273     It will call readline a maximum of twice, and return the encoding used
 274     (as a string) and a list of any lines (left as bytes) it has read
 275     in.
 276
 277     It detects the encoding from the presence of a utf-8 bom or an encoding
 278     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 279     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 280     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 281     'utf-8-sig' is returned.
 282
 283     If no encoding is specified, then the default of 'utf-8' will be returned.
 284     """
 285     bom_found = False
 286     encoding = None
 287     default = 'utf-8'
 288     def read_or_stop():
 289         try:
 290             return readline()
 291         except StopIteration:
 292             return bytes()
 293
 294     def find_cookie(line):
 295         try:
 296             line_string = line.decode('ascii')
 297         except UnicodeDecodeError:
 298             return None
 299         match = cookie_re.match(line_string)
 300         if not match:
 301             return None
 302         encoding = _get_normal_name(match.group(1))
 303         try:
 304             codec = lookup(encoding)
 305         except LookupError:
 306             # This behaviour mimics the Python interpreter
 307             raise SyntaxError("unknown encoding: " + encoding)
 308
 309         if bom_found:
 310             if codec.name != 'utf-8':
 311                 # This behaviour mimics the Python interpreter
 312                 raise SyntaxError('encoding problem: utf-8')
 313             encoding += '-sig'
 314         return encoding
 315
 316     first = read_or_stop()
 317     if first.startswith(BOM_UTF8):
 318         bom_found = True
 319         first = first[3:]
 320         default = 'utf-8-sig'
 321     if not first:
 322         return default, []
 323
 324     encoding = find_cookie(first)
 325     if encoding:
 326         return encoding, [first]
 327     if not blank_re.match(first):
 328         return default, [first]
 329
 330     second = read_or_stop()
 331     if not second:
 332         return default, [first]
 333
 334     encoding = find_cookie(second)
 335     if encoding:
 336         return encoding, [first, second]
 337
 338     return default, [first, second]
 339
 340 def untokenize(iterable):
 341     """Transform tokens back into Python source code.
 342
 343     Each element returned by the iterable must be a token sequence
 344     with at least two elements, a token number and token value.  If
 345     only two tokens are passed, the resulting output is poor.
 346
 347     Round-trip invariant for full input:
 348         Untokenized source will match input source exactly
 349
 350     Round-trip invariant for limited intput:
 351         # Output text will tokenize the back to the input
 352         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 353         newcode = untokenize(t1)
 354         readline = iter(newcode.splitlines(1)).next
 355         t2 = [tok[:2] for tokin generate_tokens(readline)]
 356         assert t1 == t2
 357     """
 358     ut = Untokenizer()
 359     return ut.untokenize(iterable)
 360
 361 def generate_tokens(readline):
 362     """
 363     The generate_tokens() generator requires one argument, readline, which
 364     must be a callable object which provides the same interface as the
 365     readline() method of built-in file objects. Each call to the function
 366     should return one line of input as a string.  Alternately, readline
 367     can be a callable function terminating with StopIteration:
 368         readline = open(myfile).next    # Example of alternate readline
 369
 370     The generator produces 5-tuples with these members: the token type; the
 371     token string; a 2-tuple (srow, scol) of ints specifying the row and
 372     column where the token begins in the source; a 2-tuple (erow, ecol) of
 373     ints specifying the row and column where the token ends in the source;
 374     and the line on which the token was found. The line passed is the
 375     logical line; continuation lines are included.
 376     """
 377     lnum = parenlev = continued = 0
 378     namechars, numchars = string.ascii_letters + '_', '0123456789'
 379     contstr, needcont = '', 0
 380     contline = None
 381     indents = [0]
 382
 383     # 'stashed' and 'async_*' are used for async/await parsing
 384     stashed = None
 385     async_def = False
 386     async_def_indent = 0
 387     async_def_nl = False
 388
 389     while 1:                                   # loop over lines in stream
 390         try:
 391             line = readline()
 392         except StopIteration:
 393             line = ''
 394         lnum = lnum + 1
 395         pos, max = 0, len(line)
 396
 397         if contstr:                            # continued string
 398             if not line:
 399                 raise TokenError("EOF in multi-line string", strstart)
 400             endmatch = endprog.match(line)
 401             if endmatch:
 402                 pos = end = endmatch.end(0)
 403                 yield (STRING, contstr + line[:end],
 404                        strstart, (lnum, end), contline + line)
 405                 contstr, needcont = '', 0
 406                 contline = None
 407             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 408                 yield (ERRORTOKEN, contstr + line,
 409                            strstart, (lnum, len(line)), contline)
 410                 contstr = ''
 411                 contline = None
 412                 continue
 413             else:
 414                 contstr = contstr + line
 415                 contline = contline + line
 416                 continue
 417
 418         elif parenlev == 0 and not continued:  # new statement
 419             if not line: break
 420             column = 0
 421             while pos < max:                   # measure leading whitespace
 422                 if line[pos] == ' ': column = column + 1
 423                 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
 424                 elif line[pos] == '\f': column = 0
 425                 else: break
 426                 pos = pos + 1
 427             if pos == max: break
 428
 429             if stashed:
 430                 yield stashed
 431                 stashed = None
 432
 433             if line[pos] in '#\r\n':           # skip comments or blank lines
 434                 if line[pos] == '#':
 435                     comment_token = line[pos:].rstrip('\r\n')
 436                     nl_pos = pos + len(comment_token)
 437                     yield (COMMENT, comment_token,
 438                            (lnum, pos), (lnum, pos + len(comment_token)), line)
 439                     yield (NL, line[nl_pos:],
 440                            (lnum, nl_pos), (lnum, len(line)), line)
 441                 else:
 442                     yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
 443                            (lnum, pos), (lnum, len(line)), line)
 444                 continue
 445
 446             if column > indents[-1]:           # count indents or dedents
 447                 indents.append(column)
 448                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 449             while column < indents[-1]:
 450                 if column not in indents:
 451                     raise IndentationError(
 452                         "unindent does not match any outer indentation level",
 453                         ("<tokenize>", lnum, pos, line))
 454                 indents = indents[:-1]
 455
 456                 if async_def and async_def_indent >= indents[-1]:
 457                     async_def = False
 458                     async_def_nl = False
 459                     async_def_indent = 0
 460
 461                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 462
 463             if async_def and async_def_nl and async_def_indent >= indents[-1]:
 464                 async_def = False
 465                 async_def_nl = False
 466                 async_def_indent = 0
 467
 468         else:                                  # continued statement
 469             if not line:
 470                 raise TokenError("EOF in multi-line statement", (lnum, 0))
 471             continued = 0
 472
 473         while pos < max:
 474             pseudomatch = pseudoprog.match(line, pos)
 475             if pseudomatch:                                # scan for tokens
 476                 start, end = pseudomatch.span(1)
 477                 spos, epos, pos = (lnum, start), (lnum, end), end
 478                 token, initial = line[start:end], line[start]
 479
 480                 if initial in numchars or \
 481                    (initial == '.' and token != '.'):      # ordinary number
 482                     yield (NUMBER, token, spos, epos, line)
 483                 elif initial in '\r\n':
 484                     newline = NEWLINE
 485                     if parenlev > 0:
 486                         newline = NL
 487                     elif async_def:
 488                         async_def_nl = True
 489                     if stashed:
 490                         yield stashed
 491                         stashed = None
 492                     yield (newline, token, spos, epos, line)
 493
 494                 elif initial == '#':
 495                     assert not token.endswith("\n")
 496                     if stashed:
 497                         yield stashed
 498                         stashed = None
 499                     yield (COMMENT, token, spos, epos, line)
 500                 elif token in triple_quoted:
 501                     endprog = endprogs[token]
 502                     endmatch = endprog.match(line, pos)
 503                     if endmatch:                           # all on one line
 504                         pos = endmatch.end(0)
 505                         token = line[start:pos]
 506                         if stashed:
 507                             yield stashed
 508                             stashed = None
 509                         yield (STRING, token, spos, (lnum, pos), line)
 510                     else:
 511                         strstart = (lnum, start)           # multiple lines
 512                         contstr = line[start:]
 513                         contline = line
 514                         break
 515                 elif initial in single_quoted or \
 516                     token[:2] in single_quoted or \
 517                     token[:3] in single_quoted:
 518                     if token[-1] == '\n':                  # continued string
 519                         strstart = (lnum, start)
 520                         endprog = (endprogs[initial] or endprogs[token[1]] or
 521                                    endprogs[token[2]])
 522                         contstr, needcont = line[start:], 1
 523                         contline = line
 524                         break
 525                     else:                                  # ordinary string
 526                         if stashed:
 527                             yield stashed
 528                             stashed = None
 529                         yield (STRING, token, spos, epos, line)
 530                 elif initial in namechars:                 # ordinary name
 531                     if token in ('async', 'await'):
 532                         if async_def:
 533                             yield (ASYNC if token == 'async' else AWAIT,
 534                                    token, spos, epos, line)
 535                             continue
 536
 537                     tok = (NAME, token, spos, epos, line)
 538                     if token == 'async' and not stashed:
 539                         stashed = tok
 540                         continue
 541
 542                     if token == 'def':
 543                         if (stashed
 544                                 and stashed[0] == NAME
 545                                 and stashed[1] == 'async'):
 546
 547                             async_def = True
 548                             async_def_indent = indents[-1]
 549
 550                             yield (ASYNC, stashed[1],
 551                                    stashed[2], stashed[3],
 552                                    stashed[4])
 553                             stashed = None
 554
 555                     if stashed:
 556                         yield stashed
 557                         stashed = None
 558
 559                     yield tok
 560                 elif initial == '\\':                      # continued stmt
 561                     # This yield is new; needed for better idempotency:
 562                     if stashed:
 563                         yield stashed
 564                         stashed = None
 565                     yield (NL, token, spos, (lnum, pos), line)
 566                     continued = 1
 567                 else:
 568                     if initial in '([{': parenlev = parenlev + 1
 569                     elif initial in ')]}': parenlev = parenlev - 1
 570                     if stashed:
 571                         yield stashed
 572                         stashed = None
 573                     yield (OP, token, spos, epos, line)
 574             else:
 575                 yield (ERRORTOKEN, line[pos],
 576                            (lnum, pos), (lnum, pos+1), line)
 577                 pos = pos + 1
 578
 579     if stashed:
 580         yield stashed
 581         stashed = None
 582
 583     for indent in indents[1:]:                 # pop remaining indent levels
 584         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
 585     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 586
 587 if __name__ == '__main__':                     # testing
 588     import sys
 589     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 590     else: tokenize(sys.stdin.readline)