blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 """Tokenization help for Python programs.
   5
   6 generate_tokens(readline) is a generator that breaks a stream of
   7 text into Python tokens.  It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or "" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
  20
  21 Older entry points
  22     tokenize_loop(readline, tokeneater)
  23     tokenize(readline, tokeneater=printtoken)
  24 are the same, except instead of generating tokens, tokeneater is a callback
  25 function to which the 5 fields described above are passed as 5 arguments,
  26 each time a new token is found."""
  27
  28 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  29 __credits__ = \
  30     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  31
  32 import string, re, unicodedata
  33 from codecs import BOM_UTF8, lookup
  34 from blib2to3.pgen2.token import *
  35
# Bind the sibling ``token`` module by name so its attributes can be listed.
from . import token
# Public API: every non-underscore attribute of ``token`` plus the three
# entry points defined in this module.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
# The module reference was only needed to build ``__all__``.
del token
  40
# Compatibility shim: probe for the ``bytes`` name and fall back to ``str``
# when the interpreter does not define it.
try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str
  47
  48 def group(*choices): return '(' + '|'.join(choices) + ')'
  49 def any(*choices): return group(*choices) + '*'
  50 def maybe(*choices): return group(*choices) + '?'
  51
# Regular-expression building blocks.  Every name below is a pattern *string*
# assembled with group(), any() and maybe(); compilation happens afterwards.

# Horizontal whitespace (space, form feed, tab); may be empty.
Whitespace = r'[ \f\t]*'
# A comment: '#' up to, but not including, the line ending.
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-newline continuations, optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# An identifier: one word character that is not a digit, then word characters.
Name = r'[^\d\W]\w*'

# Integer literals.  Underscores between digit runs are accepted; the hex,
# octal and decimal forms also accept a trailing l/L suffix.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
# The o/O marker is optional, so a bare leading zero also matches here.
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
# Either a number starting with a non-zero digit, or a lone '0'.
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float literals: an exponent part, "digits." / ".digits" forms, and
# digits followed directly by an exponent.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are listed imaginary, float, integer, in that order.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one of u/r/b/f, or the pairs rb, br, ur (any case).
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
# Opening of a triple-quoted string (prefix plus three quote characters).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

# Any single bracket character.
Bracket = '[][(){}]'
# Line endings and the remaining one-character punctuation tokens.
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Backslash continuation, comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern scanned by generate_tokens(); capture group 1 spans the token text
# that follows the leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 105
# Compiled forms of the token patterns (Unicode-aware matching).
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
# Matchers for the remainder of a ''' / """ string once its opener is seen.
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
# Maps a string opener to the compiled pattern that matches the rest of that
# string.  Bare prefix letters map to None so that generate_tokens() can chain
# lookups with ``or`` (endprogs[initial] or endprogs[token[1]] or ...) and
# fall through to the entry for the actual quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}
 136
 137 triple_quoted = {}
 138 for t in ("'''", '"""',
 139           "r'''", 'r"""', "R'''", 'R"""',
 140           "u'''", 'u"""', "U'''", 'U"""',
 141           "b'''", 'b"""', "B'''", 'B"""',
 142           "f'''", 'f"""', "F'''", 'F"""',
 143           "ur'''", 'ur"""', "Ur'''", 'Ur"""',
 144           "uR'''", 'uR"""', "UR'''", 'UR"""',
 145           "br'''", 'br"""', "Br'''", 'Br"""',
 146           "bR'''", 'bR"""', "BR'''", 'BR"""',
 147           "rb'''", 'rb"""', "Rb'''", 'Rb"""',
 148           "rB'''", 'rB"""', "RB'''", 'RB"""',):
 149     triple_quoted[t] = t
 150 single_quoted = {}
 151 for t in ("'", '"',
 152           "r'", 'r"', "R'", 'R"',
 153           "u'", 'u"', "U'", 'U"',
 154           "b'", 'b"', "B'", 'B"',
 155           "f'", 'f"', "F'", 'F"',
 156           "ur'", 'ur"', "Ur'", 'Ur"',
 157           "uR'", 'uR"', "UR'", 'UR"',
 158           "br'", 'br"', "Br'", 'Br"',
 159           "bR'", 'bR"', "BR'", 'BR"',
 160           "rb'", 'rb"', "Rb'", 'Rb"',
 161           "rB'", 'rB"', "RB'", 'RB"',):
 162     single_quoted[t] = t
 163
# Width of a tab stop; generate_tokens() uses it when measuring the column
# of a line's leading whitespace.
tabsize = 8
 165
 166 class TokenError(Exception): pass
 167
 168 class StopTokenizing(Exception): pass
 169
 170 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
 171     (srow, scol) = xxx_todo_changeme
 172     (erow, ecol) = xxx_todo_changeme1
 173     print("%d,%d-%d,%d:\t%s\t%s" % \
 174         (srow, scol, erow, ecol, tok_name[type], repr(token)))
 175
 176 def tokenize(readline, tokeneater=printtoken):
 177     """
 178     The tokenize() function accepts two parameters: one representing the
 179     input stream, and one providing an output mechanism for tokenize().
 180
 181     The first parameter, readline, must be a callable object which provides
 182     the same interface as the readline() method of built-in file objects.
 183     Each call to the function should return one line of input as a string.
 184
 185     The second parameter, tokeneater, must also be a callable object. It is
 186     called once for each token, with five arguments, corresponding to the
 187     tuples generated by generate_tokens().
 188     """
 189     try:
 190         tokenize_loop(readline, tokeneater)
 191     except StopTokenizing:
 192         pass
 193
 194 # backwards compatible interface
 195 def tokenize_loop(readline, tokeneater):
 196     for token_info in generate_tokens(readline):
 197         tokeneater(*token_info)
 198
 199 class Untokenizer:
 200
 201     def __init__(self):
 202         self.tokens = []
 203         self.prev_row = 1
 204         self.prev_col = 0
 205
 206     def add_whitespace(self, start):
 207         row, col = start
 208         assert row <= self.prev_row
 209         col_offset = col - self.prev_col
 210         if col_offset:
 211             self.tokens.append(" " * col_offset)
 212
 213     def untokenize(self, iterable):
 214         for t in iterable:
 215             if len(t) == 2:
 216                 self.compat(t, iterable)
 217                 break
 218             tok_type, token, start, end, line = t
 219             self.add_whitespace(start)
 220             self.tokens.append(token)
 221             self.prev_row, self.prev_col = end
 222             if tok_type in (NEWLINE, NL):
 223                 self.prev_row += 1
 224                 self.prev_col = 0
 225         return "".join(self.tokens)
 226
 227     def compat(self, token, iterable):
 228         startline = False
 229         indents = []
 230         toks_append = self.tokens.append
 231         toknum, tokval = token
 232         if toknum in (NAME, NUMBER):
 233             tokval += ' '
 234         if toknum in (NEWLINE, NL):
 235             startline = True
 236         for tok in iterable:
 237             toknum, tokval = tok[:2]
 238
 239             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 240                 tokval += ' '
 241
 242             if toknum == INDENT:
 243                 indents.append(tokval)
 244                 continue
 245             elif toknum == DEDENT:
 246                 indents.pop()
 247                 continue
 248             elif toknum in (NEWLINE, NL):
 249                 startline = True
 250             elif startline and indents:
 251                 toks_append(indents[-1])
 252                 startline = False
 253             toks_append(tokval)
 254
# Matches a comment line carrying an encoding declaration ("coding:" or
# "coding="); group 1 is the declared encoding name.  Applied to text.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a bytes line that is empty, whitespace-only, or whose first
# non-blank character starts a comment or a line ending.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 257
 258 def _get_normal_name(orig_enc):
 259     """Imitates get_normal_name in tokenizer.c."""
 260     # Only care about the first 12 characters.
 261     enc = orig_enc[:12].lower().replace("_", "-")
 262     if enc == "utf-8" or enc.startswith("utf-8-"):
 263         return "utf-8"
 264     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
 265        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
 266         return "iso-8859-1"
 267     return orig_enc
 268
 269 def detect_encoding(readline):
 270     """
 271     The detect_encoding() function is used to detect the encoding that should
 272     be used to decode a Python source file. It requires one argument, readline,
 273     in the same way as the tokenize() generator.
 274
 275     It will call readline a maximum of twice, and return the encoding used
 276     (as a string) and a list of any lines (left as bytes) it has read
 277     in.
 278
 279     It detects the encoding from the presence of a utf-8 bom or an encoding
 280     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 281     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 282     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 283     'utf-8-sig' is returned.
 284
 285     If no encoding is specified, then the default of 'utf-8' will be returned.
 286     """
 287     bom_found = False
 288     encoding = None
 289     default = 'utf-8'
 290     def read_or_stop():
 291         try:
 292             return readline()
 293         except StopIteration:
 294             return bytes()
 295
 296     def find_cookie(line):
 297         try:
 298             line_string = line.decode('ascii')
 299         except UnicodeDecodeError:
 300             return None
 301         match = cookie_re.match(line_string)
 302         if not match:
 303             return None
 304         encoding = _get_normal_name(match.group(1))
 305         try:
 306             codec = lookup(encoding)
 307         except LookupError:
 308             # This behaviour mimics the Python interpreter
 309             raise SyntaxError("unknown encoding: " + encoding)
 310
 311         if bom_found:
 312             if codec.name != 'utf-8':
 313                 # This behaviour mimics the Python interpreter
 314                 raise SyntaxError('encoding problem: utf-8')
 315             encoding += '-sig'
 316         return encoding
 317
 318     first = read_or_stop()
 319     if first.startswith(BOM_UTF8):
 320         bom_found = True
 321         first = first[3:]
 322         default = 'utf-8-sig'
 323     if not first:
 324         return default, []
 325
 326     encoding = find_cookie(first)
 327     if encoding:
 328         return encoding, [first]
 329     if not blank_re.match(first):
 330         return default, [first]
 331
 332     second = read_or_stop()
 333     if not second:
 334         return default, [first]
 335
 336     encoding = find_cookie(second)
 337     if encoding:
 338         return encoding, [first, second]
 339
 340     return default, [first, second]
 341
 342 def untokenize(iterable):
 343     """Transform tokens back into Python source code.
 344
 345     Each element returned by the iterable must be a token sequence
 346     with at least two elements, a token number and token value.  If
 347     only two tokens are passed, the resulting output is poor.
 348
 349     Round-trip invariant for full input:
 350         Untokenized source will match input source exactly
 351
 352     Round-trip invariant for limited intput:
 353         # Output text will tokenize the back to the input
 354         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 355         newcode = untokenize(t1)
 356         readline = iter(newcode.splitlines(1)).next
 357         t2 = [tok[:2] for tokin generate_tokens(readline)]
 358         assert t1 == t2
 359     """
 360     ut = Untokenizer()
 361     return ut.untokenize(iterable)
 362
# Unicode general categories that generate_tokens() accepts for the first
# character of a NAME token when that character is not an ASCII letter or
# underscore.
InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
 364
 365 def generate_tokens(readline):
 366     """
 367     The generate_tokens() generator requires one argument, readline, which
 368     must be a callable object which provides the same interface as the
 369     readline() method of built-in file objects. Each call to the function
 370     should return one line of input as a string.  Alternately, readline
 371     can be a callable function terminating with StopIteration:
 372         readline = open(myfile).next    # Example of alternate readline
 373
 374     The generator produces 5-tuples with these members: the token type; the
 375     token string; a 2-tuple (srow, scol) of ints specifying the row and
 376     column where the token begins in the source; a 2-tuple (erow, ecol) of
 377     ints specifying the row and column where the token ends in the source;
 378     and the line on which the token was found. The line passed is the
 379     logical line; continuation lines are included.
 380     """
 381     lnum = parenlev = continued = 0
 382     namechars, numchars = string.ascii_letters + '_', '0123456789'
 383     contstr, needcont = '', 0
 384     contline = None
 385     indents = [0]
 386
 387     # 'stashed' and 'async_*' are used for async/await parsing
 388     stashed = None
 389     async_def = False
 390     async_def_indent = 0
 391     async_def_nl = False
 392
 393     while 1:                                   # loop over lines in stream
 394         try:
 395             line = readline()
 396         except StopIteration:
 397             line = ''
 398         lnum = lnum + 1
 399         pos, max = 0, len(line)
 400
 401         if contstr:                            # continued string
 402             if not line:
 403                 raise TokenError("EOF in multi-line string", strstart)
 404             endmatch = endprog.match(line)
 405             if endmatch:
 406                 pos = end = endmatch.end(0)
 407                 yield (STRING, contstr + line[:end],
 408                        strstart, (lnum, end), contline + line)
 409                 contstr, needcont = '', 0
 410                 contline = None
 411             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 412                 yield (ERRORTOKEN, contstr + line,
 413                            strstart, (lnum, len(line)), contline)
 414                 contstr = ''
 415                 contline = None
 416                 continue
 417             else:
 418                 contstr = contstr + line
 419                 contline = contline + line
 420                 continue
 421
 422         elif parenlev == 0 and not continued:  # new statement
 423             if not line: break
 424             column = 0
 425             while pos < max:                   # measure leading whitespace
 426                 if line[pos] == ' ': column = column + 1
 427                 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
 428                 elif line[pos] == '\f': column = 0
 429                 else: break
 430                 pos = pos + 1
 431             if pos == max: break
 432
 433             if stashed:
 434                 yield stashed
 435                 stashed = None
 436
 437             if line[pos] in '\r\n':            # skip blank lines
 438                 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
 439                 continue
 440
 441             if column > indents[-1]:           # count indents
 442                 indents.append(column)
 443                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 444
 445             if line[pos] == '#':               # skip comments
 446                 comment_token = line[pos:].rstrip('\r\n')
 447                 nl_pos = pos + len(comment_token)
 448                 yield (COMMENT, comment_token,
 449                         (lnum, pos), (lnum, pos + len(comment_token)), line)
 450                 yield (NL, line[nl_pos:],
 451                         (lnum, nl_pos), (lnum, len(line)), line)
 452                 continue
 453
 454             while column < indents[-1]:        # count dedents
 455                 if column not in indents:
 456                     raise IndentationError(
 457                         "unindent does not match any outer indentation level",
 458                         ("<tokenize>", lnum, pos, line))
 459                 indents = indents[:-1]
 460
 461                 if async_def and async_def_indent >= indents[-1]:
 462                     async_def = False
 463                     async_def_nl = False
 464                     async_def_indent = 0
 465
 466                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 467
 468             if async_def and async_def_nl and async_def_indent >= indents[-1]:
 469                 async_def = False
 470                 async_def_nl = False
 471                 async_def_indent = 0
 472
 473         else:                                  # continued statement
 474             if not line:
 475                 raise TokenError("EOF in multi-line statement", (lnum, 0))
 476             continued = 0
 477
 478         while pos < max:
 479             pseudomatch = pseudoprog.match(line, pos)
 480             if not pseudomatch:
 481                 print('no pseudomatch')
 482             if pseudomatch:                                # scan for tokens
 483                 start, end = pseudomatch.span(1)
 484                 spos, epos, pos = (lnum, start), (lnum, end), end
 485                 token, initial = line[start:end], line[start]
 486
 487                 if initial in numchars or \
 488                    (initial == '.' and token != '.'):      # ordinary number
 489                     yield (NUMBER, token, spos, epos, line)
 490                 elif initial in '\r\n':
 491                     newline = NEWLINE
 492                     if parenlev > 0:
 493                         newline = NL
 494                     elif async_def:
 495                         async_def_nl = True
 496                     if stashed:
 497                         yield stashed
 498                         stashed = None
 499                     yield (newline, token, spos, epos, line)
 500
 501                 elif initial == '#':
 502                     assert not token.endswith("\n")
 503                     if stashed:
 504                         yield stashed
 505                         stashed = None
 506                     yield (COMMENT, token, spos, epos, line)
 507                 elif token in triple_quoted:
 508                     endprog = endprogs[token]
 509                     endmatch = endprog.match(line, pos)
 510                     if endmatch:                           # all on one line
 511                         pos = endmatch.end(0)
 512                         token = line[start:pos]
 513                         if stashed:
 514                             yield stashed
 515                             stashed = None
 516                         yield (STRING, token, spos, (lnum, pos), line)
 517                     else:
 518                         strstart = (lnum, start)           # multiple lines
 519                         contstr = line[start:]
 520                         contline = line
 521                         break
 522                 elif initial in single_quoted or \
 523                     token[:2] in single_quoted or \
 524                     token[:3] in single_quoted:
 525                     if token[-1] == '\n':                  # continued string
 526                         strstart = (lnum, start)
 527                         endprog = (endprogs[initial] or endprogs[token[1]] or
 528                                    endprogs[token[2]])
 529                         contstr, needcont = line[start:], 1
 530                         contline = line
 531                         break
 532                     else:                                  # ordinary string
 533                         if stashed:
 534                             yield stashed
 535                             stashed = None
 536                         yield (STRING, token, spos, epos, line)
 537                 elif (initial in namechars or              # ordinary name
 538                       unicodedata.category(initial) in InitialCategories):
 539                     if token in ('async', 'await'):
 540                         if async_def:
 541                             yield (ASYNC if token == 'async' else AWAIT,
 542                                    token, spos, epos, line)
 543                             continue
 544
 545                     tok = (NAME, token, spos, epos, line)
 546                     if token == 'async' and not stashed:
 547                         stashed = tok
 548                         continue
 549
 550                     if token == 'def':
 551                         if (stashed
 552                                 and stashed[0] == NAME
 553                                 and stashed[1] == 'async'):
 554
 555                             async_def = True
 556                             async_def_indent = indents[-1]
 557
 558                             yield (ASYNC, stashed[1],
 559                                    stashed[2], stashed[3],
 560                                    stashed[4])
 561                             stashed = None
 562
 563                     if stashed:
 564                         yield stashed
 565                         stashed = None
 566
 567                     yield tok
 568                 elif initial == '\\':                      # continued stmt
 569                     # This yield is new; needed for better idempotency:
 570                     if stashed:
 571                         yield stashed
 572                         stashed = None
 573                     yield (NL, token, spos, (lnum, pos), line)
 574                     continued = 1
 575                 else:
 576                     if initial in '([{': parenlev = parenlev + 1
 577                     elif initial in ')]}': parenlev = parenlev - 1
 578                     if stashed:
 579                         yield stashed
 580                         stashed = None
 581                     yield (OP, token, spos, epos, line)
 582             else:
 583                 yield (ERRORTOKEN, line[pos],
 584                            (lnum, pos), (lnum, pos+1), line)
 585                 pos = pos + 1
 586
 587     if stashed:
 588         yield stashed
 589         stashed = None
 590
 591     for indent in indents[1:]:                 # pop remaining indent levels
 592         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
 593     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 594
 595 if __name__ == '__main__':                     # testing
 596     import sys
 597     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 598     else: tokenize(sys.stdin.readline)