blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 """Tokenization help for Python programs.
   5
   6 generate_tokens(readline) is a generator that breaks a stream of
   7 text into Python tokens.  It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or "" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
  17 It is designed to match the working of the Python tokenizer exactly, except
  18 that it produces COMMENT tokens for comments and gives type OP for all
  19 operators
  20
  21 Older entry points
  22     tokenize_loop(readline, tokeneater)
  23     tokenize(readline, tokeneater=printtoken)
  24 are the same, except instead of generating tokens, tokeneater is a callback
  25 function to which the 5 fields described above are passed as 5 arguments,
  26 each time a new token is found."""
  27
  28 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  29 __credits__ = \
  30     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  31
  32 import string, re, unicodedata
  33 from codecs import BOM_UTF8, lookup
  34 from blib2to3.pgen2.token import *
  35
  36 from . import token
  37 __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
  38            "generate_tokens", "untokenize"]
  39 del token
  40
  41 try:
  42     bytes
  43 except NameError:
  44     # Support bytes type in Python <= 2.5, so 2to3 turns itself into
  45     # valid Python 3 code.
  46     bytes = str
  47
  48 def group(*choices): return '(' + '|'.join(choices) + ')'
  49 def any(*choices): return group(*choices) + '*'
  50 def maybe(*choices): return group(*choices) + '?'
  51 def _combinations(*l):
  52     return set(
  53         x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
  54     )
  55
# --- Regex source strings used to build the tokenizer patterns -------------

# Horizontal whitespace only (space, form feed, tab); newlines are tokens.
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the line terminator.
Comment = r'#[^\r\n]*'
# Skippable prefix: whitespace, any number of backslash-newline
# continuations, and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# Identifier: one word character that is not a digit, then word characters.
Name = r'[^\d\W]\w*'

# Integer literals.  Underscores may separate digit groups; the hex, octal
# and decimal forms also accept an optional trailing l/L.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
# The o/O marker is optional, so a bare leading 0 followed by octal digits
# matches as well.
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point and imaginary literals.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are ordered so the more specific forms are tried first.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single letter, or r combined with f/b/u in the
# orders listed in the alternation.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (prefix plus the three quotes only).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

# Any single opening or closing bracket character.
Bracket = '[][(){}]'
# Line terminators and single-character punctuation (including backtick).
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

# A complete token preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things matched as a unit here and completed by the tokenizer loop:
# backslash continuations, comments, and triple-quote openers.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Group 1 of this pattern is the token text without leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 109
 110 tokenprog = re.compile(Token, re.UNICODE)
 111 pseudoprog = re.compile(PseudoToken, re.UNICODE)
 112 single3prog = re.compile(Single3)
 113 double3prog = re.compile(Double3)
 114
 115 _strprefixes = (
 116     _combinations('r', 'R', 'f', 'F') |
 117     _combinations('r', 'R', 'b', 'B') |
 118     {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
 119 )
 120
 121 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
 122             "'''": single3prog, '"""': double3prog,
 123             **{f"{prefix}'''": single3prog for prefix in _strprefixes},
 124             **{f'{prefix}"""': double3prog for prefix in _strprefixes},
 125             **{prefix: None for prefix in _strprefixes}}
 126
 127 triple_quoted = (
 128     {"'''", '"""'} |
 129     {f"{prefix}'''" for prefix in _strprefixes} |
 130     {f'{prefix}"""' for prefix in _strprefixes}
 131 )
 132 single_quoted = (
 133     {"'", '"'} |
 134     {f"{prefix}'" for prefix in _strprefixes} |
 135     {f'{prefix}"' for prefix in _strprefixes}
 136 )
 137
 138 tabsize = 8
 139
 140 class TokenError(Exception): pass
 141
 142 class StopTokenizing(Exception): pass
 143
 144 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
 145     (srow, scol) = xxx_todo_changeme
 146     (erow, ecol) = xxx_todo_changeme1
 147     print("%d,%d-%d,%d:\t%s\t%s" % \
 148         (srow, scol, erow, ecol, tok_name[type], repr(token)))
 149
 150 def tokenize(readline, tokeneater=printtoken):
 151     """
 152     The tokenize() function accepts two parameters: one representing the
 153     input stream, and one providing an output mechanism for tokenize().
 154
 155     The first parameter, readline, must be a callable object which provides
 156     the same interface as the readline() method of built-in file objects.
 157     Each call to the function should return one line of input as a string.
 158
 159     The second parameter, tokeneater, must also be a callable object. It is
 160     called once for each token, with five arguments, corresponding to the
 161     tuples generated by generate_tokens().
 162     """
 163     try:
 164         tokenize_loop(readline, tokeneater)
 165     except StopTokenizing:
 166         pass
 167
 168 # backwards compatible interface
 169 def tokenize_loop(readline, tokeneater):
 170     for token_info in generate_tokens(readline):
 171         tokeneater(*token_info)
 172
 173 class Untokenizer:
 174
 175     def __init__(self):
 176         self.tokens = []
 177         self.prev_row = 1
 178         self.prev_col = 0
 179
 180     def add_whitespace(self, start):
 181         row, col = start
 182         assert row <= self.prev_row
 183         col_offset = col - self.prev_col
 184         if col_offset:
 185             self.tokens.append(" " * col_offset)
 186
 187     def untokenize(self, iterable):
 188         for t in iterable:
 189             if len(t) == 2:
 190                 self.compat(t, iterable)
 191                 break
 192             tok_type, token, start, end, line = t
 193             self.add_whitespace(start)
 194             self.tokens.append(token)
 195             self.prev_row, self.prev_col = end
 196             if tok_type in (NEWLINE, NL):
 197                 self.prev_row += 1
 198                 self.prev_col = 0
 199         return "".join(self.tokens)
 200
 201     def compat(self, token, iterable):
 202         startline = False
 203         indents = []
 204         toks_append = self.tokens.append
 205         toknum, tokval = token
 206         if toknum in (NAME, NUMBER):
 207             tokval += ' '
 208         if toknum in (NEWLINE, NL):
 209             startline = True
 210         for tok in iterable:
 211             toknum, tokval = tok[:2]
 212
 213             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 214                 tokval += ' '
 215
 216             if toknum == INDENT:
 217                 indents.append(tokval)
 218                 continue
 219             elif toknum == DEDENT:
 220                 indents.pop()
 221                 continue
 222             elif toknum in (NEWLINE, NL):
 223                 startline = True
 224             elif startline and indents:
 225                 toks_append(indents[-1])
 226                 startline = False
 227             toks_append(tokval)
 228
 229 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 230 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 231
 232 def _get_normal_name(orig_enc):
 233     """Imitates get_normal_name in tokenizer.c."""
 234     # Only care about the first 12 characters.
 235     enc = orig_enc[:12].lower().replace("_", "-")
 236     if enc == "utf-8" or enc.startswith("utf-8-"):
 237         return "utf-8"
 238     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
 239        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
 240         return "iso-8859-1"
 241     return orig_enc
 242
 243 def detect_encoding(readline):
 244     """
 245     The detect_encoding() function is used to detect the encoding that should
 246     be used to decode a Python source file. It requires one argument, readline,
 247     in the same way as the tokenize() generator.
 248
 249     It will call readline a maximum of twice, and return the encoding used
 250     (as a string) and a list of any lines (left as bytes) it has read
 251     in.
 252
 253     It detects the encoding from the presence of a utf-8 bom or an encoding
 254     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 255     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 256     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 257     'utf-8-sig' is returned.
 258
 259     If no encoding is specified, then the default of 'utf-8' will be returned.
 260     """
 261     bom_found = False
 262     encoding = None
 263     default = 'utf-8'
 264     def read_or_stop():
 265         try:
 266             return readline()
 267         except StopIteration:
 268             return bytes()
 269
 270     def find_cookie(line):
 271         try:
 272             line_string = line.decode('ascii')
 273         except UnicodeDecodeError:
 274             return None
 275         match = cookie_re.match(line_string)
 276         if not match:
 277             return None
 278         encoding = _get_normal_name(match.group(1))
 279         try:
 280             codec = lookup(encoding)
 281         except LookupError:
 282             # This behaviour mimics the Python interpreter
 283             raise SyntaxError("unknown encoding: " + encoding)
 284
 285         if bom_found:
 286             if codec.name != 'utf-8':
 287                 # This behaviour mimics the Python interpreter
 288                 raise SyntaxError('encoding problem: utf-8')
 289             encoding += '-sig'
 290         return encoding
 291
 292     first = read_or_stop()
 293     if first.startswith(BOM_UTF8):
 294         bom_found = True
 295         first = first[3:]
 296         default = 'utf-8-sig'
 297     if not first:
 298         return default, []
 299
 300     encoding = find_cookie(first)
 301     if encoding:
 302         return encoding, [first]
 303     if not blank_re.match(first):
 304         return default, [first]
 305
 306     second = read_or_stop()
 307     if not second:
 308         return default, [first]
 309
 310     encoding = find_cookie(second)
 311     if encoding:
 312         return encoding, [first, second]
 313
 314     return default, [first, second]
 315
 316 def untokenize(iterable):
 317     """Transform tokens back into Python source code.
 318
 319     Each element returned by the iterable must be a token sequence
 320     with at least two elements, a token number and token value.  If
 321     only two tokens are passed, the resulting output is poor.
 322
 323     Round-trip invariant for full input:
 324         Untokenized source will match input source exactly
 325
 326     Round-trip invariant for limited intput:
 327         # Output text will tokenize the back to the input
 328         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 329         newcode = untokenize(t1)
 330         readline = iter(newcode.splitlines(1)).next
 331         t2 = [tok[:2] for tokin generate_tokens(readline)]
 332         assert t1 == t2
 333     """
 334     ut = Untokenizer()
 335     return ut.untokenize(iterable)
 336
 337 InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
 338
 339 def generate_tokens(readline):
 340     """
 341     The generate_tokens() generator requires one argument, readline, which
 342     must be a callable object which provides the same interface as the
 343     readline() method of built-in file objects. Each call to the function
 344     should return one line of input as a string.  Alternately, readline
 345     can be a callable function terminating with StopIteration:
 346         readline = open(myfile).next    # Example of alternate readline
 347
 348     The generator produces 5-tuples with these members: the token type; the
 349     token string; a 2-tuple (srow, scol) of ints specifying the row and
 350     column where the token begins in the source; a 2-tuple (erow, ecol) of
 351     ints specifying the row and column where the token ends in the source;
 352     and the line on which the token was found. The line passed is the
 353     logical line; continuation lines are included.
 354     """
 355     lnum = parenlev = continued = 0
 356     namechars, numchars = string.ascii_letters + '_', '0123456789'
 357     contstr, needcont = '', 0
 358     contline = None
 359     indents = [0]
 360
 361     # 'stashed' and 'async_*' are used for async/await parsing
 362     stashed = None
 363     async_def = False
 364     async_def_indent = 0
 365     async_def_nl = False
 366
 367     while 1:                                   # loop over lines in stream
 368         try:
 369             line = readline()
 370         except StopIteration:
 371             line = ''
 372         lnum = lnum + 1
 373         pos, max = 0, len(line)
 374
 375         if contstr:                            # continued string
 376             if not line:
 377                 raise TokenError("EOF in multi-line string", strstart)
 378             endmatch = endprog.match(line)
 379             if endmatch:
 380                 pos = end = endmatch.end(0)
 381                 yield (STRING, contstr + line[:end],
 382                        strstart, (lnum, end), contline + line)
 383                 contstr, needcont = '', 0
 384                 contline = None
 385             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 386                 yield (ERRORTOKEN, contstr + line,
 387                            strstart, (lnum, len(line)), contline)
 388                 contstr = ''
 389                 contline = None
 390                 continue
 391             else:
 392                 contstr = contstr + line
 393                 contline = contline + line
 394                 continue
 395
 396         elif parenlev == 0 and not continued:  # new statement
 397             if not line: break
 398             column = 0
 399             while pos < max:                   # measure leading whitespace
 400                 if line[pos] == ' ': column = column + 1
 401                 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
 402                 elif line[pos] == '\f': column = 0
 403                 else: break
 404                 pos = pos + 1
 405             if pos == max: break
 406
 407             if stashed:
 408                 yield stashed
 409                 stashed = None
 410
 411             if line[pos] in '\r\n':            # skip blank lines
 412                 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
 413                 continue
 414
 415             if column > indents[-1]:           # count indents
 416                 indents.append(column)
 417                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 418
 419             if line[pos] == '#':               # skip comments
 420                 comment_token = line[pos:].rstrip('\r\n')
 421                 nl_pos = pos + len(comment_token)
 422                 yield (COMMENT, comment_token,
 423                         (lnum, pos), (lnum, pos + len(comment_token)), line)
 424                 yield (NL, line[nl_pos:],
 425                         (lnum, nl_pos), (lnum, len(line)), line)
 426                 continue
 427
 428             while column < indents[-1]:        # count dedents
 429                 if column not in indents:
 430                     raise IndentationError(
 431                         "unindent does not match any outer indentation level",
 432                         ("<tokenize>", lnum, pos, line))
 433                 indents = indents[:-1]
 434
 435                 if async_def and async_def_indent >= indents[-1]:
 436                     async_def = False
 437                     async_def_nl = False
 438                     async_def_indent = 0
 439
 440                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 441
 442             if async_def and async_def_nl and async_def_indent >= indents[-1]:
 443                 async_def = False
 444                 async_def_nl = False
 445                 async_def_indent = 0
 446
 447         else:                                  # continued statement
 448             if not line:
 449                 raise TokenError("EOF in multi-line statement", (lnum, 0))
 450             continued = 0
 451
 452         while pos < max:
 453             pseudomatch = pseudoprog.match(line, pos)
 454             if not pseudomatch:
 455                 print('no pseudomatch')
 456             if pseudomatch:                                # scan for tokens
 457                 start, end = pseudomatch.span(1)
 458                 spos, epos, pos = (lnum, start), (lnum, end), end
 459                 token, initial = line[start:end], line[start]
 460
 461                 if initial in numchars or \
 462                    (initial == '.' and token != '.'):      # ordinary number
 463                     yield (NUMBER, token, spos, epos, line)
 464                 elif initial in '\r\n':
 465                     newline = NEWLINE
 466                     if parenlev > 0:
 467                         newline = NL
 468                     elif async_def:
 469                         async_def_nl = True
 470                     if stashed:
 471                         yield stashed
 472                         stashed = None
 473                     yield (newline, token, spos, epos, line)
 474
 475                 elif initial == '#':
 476                     assert not token.endswith("\n")
 477                     if stashed:
 478                         yield stashed
 479                         stashed = None
 480                     yield (COMMENT, token, spos, epos, line)
 481                 elif token in triple_quoted:
 482                     endprog = endprogs[token]
 483                     endmatch = endprog.match(line, pos)
 484                     if endmatch:                           # all on one line
 485                         pos = endmatch.end(0)
 486                         token = line[start:pos]
 487                         if stashed:
 488                             yield stashed
 489                             stashed = None
 490                         yield (STRING, token, spos, (lnum, pos), line)
 491                     else:
 492                         strstart = (lnum, start)           # multiple lines
 493                         contstr = line[start:]
 494                         contline = line
 495                         break
 496                 elif initial in single_quoted or \
 497                     token[:2] in single_quoted or \
 498                     token[:3] in single_quoted:
 499                     if token[-1] == '\n':                  # continued string
 500                         strstart = (lnum, start)
 501                         endprog = (endprogs[initial] or endprogs[token[1]] or
 502                                    endprogs[token[2]])
 503                         contstr, needcont = line[start:], 1
 504                         contline = line
 505                         break
 506                     else:                                  # ordinary string
 507                         if stashed:
 508                             yield stashed
 509                             stashed = None
 510                         yield (STRING, token, spos, epos, line)
 511                 elif (initial in namechars or              # ordinary name
 512                       unicodedata.category(initial) in InitialCategories):
 513                     if token in ('async', 'await'):
 514                         if async_def:
 515                             yield (ASYNC if token == 'async' else AWAIT,
 516                                    token, spos, epos, line)
 517                             continue
 518
 519                     tok = (NAME, token, spos, epos, line)
 520                     if token == 'async' and not stashed:
 521                         stashed = tok
 522                         continue
 523
 524                     if token == 'def':
 525                         if (stashed
 526                                 and stashed[0] == NAME
 527                                 and stashed[1] == 'async'):
 528
 529                             async_def = True
 530                             async_def_indent = indents[-1]
 531
 532                             yield (ASYNC, stashed[1],
 533                                    stashed[2], stashed[3],
 534                                    stashed[4])
 535                             stashed = None
 536
 537                     if stashed:
 538                         yield stashed
 539                         stashed = None
 540
 541                     yield tok
 542                 elif initial == '\\':                      # continued stmt
 543                     # This yield is new; needed for better idempotency:
 544                     if stashed:
 545                         yield stashed
 546                         stashed = None
 547                     yield (NL, token, spos, (lnum, pos), line)
 548                     continued = 1
 549                 else:
 550                     if initial in '([{': parenlev = parenlev + 1
 551                     elif initial in ')]}': parenlev = parenlev - 1
 552                     if stashed:
 553                         yield stashed
 554                         stashed = None
 555                     yield (OP, token, spos, epos, line)
 556             else:
 557                 yield (ERRORTOKEN, line[pos],
 558                            (lnum, pos), (lnum, pos+1), line)
 559                 pos = pos + 1
 560
 561     if stashed:
 562         yield stashed
 563         stashed = None
 564
 565     for indent in indents[1:]:                 # pop remaining indent levels
 566         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
 567     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 568
 569 if __name__ == '__main__':                     # testing
 570     import sys
 571     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 572     else: tokenize(sys.stdin.readline)