src/blib2to3/pgen2/tokenize.py

   1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   2 # All rights reserved.
   3
   4 # mypy: allow-untyped-defs, allow-untyped-calls
   5
   6 """Tokenization help for Python programs.
   7
   8 generate_tokens(readline) is a generator that breaks a stream of
   9 text into Python tokens.  It accepts a readline-like method which is called
  10 repeatedly to get the next line of input (or "" for EOF).  It generates
  11 5-tuples with these members:
  12
  13     the token type (see token.py)
  14     the token (a string)
  15     the starting (row, column) indices of the token (a 2-tuple of ints)
  16     the ending (row, column) indices of the token (a 2-tuple of ints)
  17     the original line (string)
  18
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
  22
  23 Older entry points
  24     tokenize_loop(readline, tokeneater)
  25     tokenize(readline, tokeneater=printtoken)
  26 are the same, except instead of generating tokens, tokeneater is a callback
  27 function to which the 5 fields described above are passed as 5 arguments,
  28 each time a new token is found."""
  29
  30 import sys
  31 from typing import (
  32     Callable,
  33     Final,
  34     Iterable,
  35     Iterator,
  36     List,
  37     Optional,
  38     Pattern,
  39     Set,
  40     Tuple,
  41     Union,
  42     cast,
  43 )
  44
  45 from blib2to3.pgen2.grammar import Grammar
  46 from blib2to3.pgen2.token import (
  47     ASYNC,
  48     AWAIT,
  49     COMMENT,
  50     DEDENT,
  51     ENDMARKER,
  52     ERRORTOKEN,
  53     INDENT,
  54     NAME,
  55     NEWLINE,
  56     NL,
  57     NUMBER,
  58     OP,
  59     STRING,
  60     tok_name,
  61 )
  62
  63 __author__ = "Ka-Ping Yee <ping@lfw.org>"
  64 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  65
  66 import re
  67 from codecs import BOM_UTF8, lookup
  68
  69 from . import token
  70
# Public API: every name in the token module that does not start with an
# underscore, plus this module's own entry points.
__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
# The module object was only needed to build __all__; remove the name again.
del token
  77
  78
  79 def group(*choices: str) -> str:
  80     return "(" + "|".join(choices) + ")"
  81
  82
  83 def any(*choices: str) -> str:
  84     return group(*choices) + "*"
  85
  86
  87 def maybe(*choices: str) -> str:
  88     return group(*choices) + "?"
  89
  90
  91 def _combinations(*l: str) -> Set[str]:
  92     return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
  93
  94
# Horizontal whitespace: spaces, form feeds and tabs.
Whitespace = r"[ \f\t]*"
# A comment runs from "#" up to, but not including, the line terminator.
Comment = r"#[^\r\n]*"
# Ignorable text: whitespace, any number of backslash-newline continuations
# (each followed by whitespace), and an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# Deliberately permissive: any run of characters that are neither whitespace
# nor the listed punctuation, so it also matches text starting with a digit.
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

# Integer literals. Underscores may separate digit groups, and the hex, octal
# and decimal forms accept an optional trailing "l"/"L".
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o"/"O" marker is optional, so a bare leading zero form also matches.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating point literals: either a decimal point (digits on at least one
# side) with an optional exponent, or digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by "j"/"J".
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are ordered so the more specific forms are tried first.
Number = group(Imagnumber, Floatnumber, Intnumber)
 115
# The four "tail" patterns below match the remainder of a string up to and
# including its closing quote(s); a backslash escapes the following character.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: a single u/r/b/f letter, or r combined with f or b
# in either order (and u followed by r), in any letter case.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string (optional prefix plus three quotes).
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
 131
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Any single opening or closing bracket character.
Bracket = "[][(){}]"
# Line terminators and the remaining single-character punctuation.
Special = group(r"\r?\n", r"[:;.,`@]")
# Everything that is neither a name, a number nor a string.
Funny = group(Operator, Bracket, Special)
 150
# First (or only) line of ' or " string.
# The string either closes on this line or the line ends with a
# backslash-newline, meaning it continues on the next line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash continuation, comment, or the opening of a triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Leading whitespace followed by one token candidate. The candidate is regex
# group 1, which generate_tokens() reads via span(1). Number is listed before
# Name because Name would also match digit-leading text.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled once at import time; generate_tokens() uses them for every line.
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
 162
# Every multi-letter-capable string prefix spelled out: r/f and r/b alone or
# combined in either order and any case, plus u and u followed by r.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps a string opener to the compiled pattern matching the rest of that
# string. Triple-quoted openers are keyed with and without a prefix; the
# single-quote entries are keyed by the bare quote character only.
endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

# All spellings that open a triple-quoted string (with or without a prefix).
triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All spellings that open a single-quoted string (with or without a prefix).
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
 188
# Tab stop width used by generate_tokens() when measuring indentation: a tab
# advances the column to the next multiple of this value.
tabsize = 8
 190
 191
 192 class TokenError(Exception):
 193     pass
 194
 195
 196 class StopTokenizing(Exception):
 197     pass
 198
 199
# A (row, column) position inside the source text.
Coord = Tuple[int, int]
 201
 202
 203 def printtoken(
 204     type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
 205 ) -> None:  # for testing
 206     (srow, scol) = srow_col
 207     (erow, ecol) = erow_col
 208     print(
 209         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
 210     )
 211
 212
# Callback signature used by tokenize()/tokenize_loop(): it receives the five
# token fields (type, string, start, end, line) as positional arguments.
TokenEater = Callable[[int, str, Coord, Coord, str], None]
 214
 215
 216 def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
 217     """
 218     The tokenize() function accepts two parameters: one representing the
 219     input stream, and one providing an output mechanism for tokenize().
 220
 221     The first parameter, readline, must be a callable object which provides
 222     the same interface as the readline() method of built-in file objects.
 223     Each call to the function should return one line of input as a string.
 224
 225     The second parameter, tokeneater, must also be a callable object. It is
 226     called once for each token, with five arguments, corresponding to the
 227     tuples generated by generate_tokens().
 228     """
 229     try:
 230         tokenize_loop(readline, tokeneater)
 231     except StopTokenizing:
 232         pass
 233
 234
 235 # backwards compatible interface
 236 def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
 237     for token_info in generate_tokens(readline):
 238         tokeneater(*token_info)
 239
 240
# Full token record: (type, string, start, end, line).
GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
# What the untokenizer accepts: either the full record or just (type, string).
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 243
 244
 245 class Untokenizer:
 246     tokens: List[str]
 247     prev_row: int
 248     prev_col: int
 249
 250     def __init__(self) -> None:
 251         self.tokens = []
 252         self.prev_row = 1
 253         self.prev_col = 0
 254
 255     def add_whitespace(self, start: Coord) -> None:
 256         row, col = start
 257         assert row <= self.prev_row
 258         col_offset = col - self.prev_col
 259         if col_offset:
 260             self.tokens.append(" " * col_offset)
 261
 262     def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
 263         for t in iterable:
 264             if len(t) == 2:
 265                 self.compat(cast(Tuple[int, str], t), iterable)
 266                 break
 267             tok_type, token, start, end, line = cast(
 268                 Tuple[int, str, Coord, Coord, str], t
 269             )
 270             self.add_whitespace(start)
 271             self.tokens.append(token)
 272             self.prev_row, self.prev_col = end
 273             if tok_type in (NEWLINE, NL):
 274                 self.prev_row += 1
 275                 self.prev_col = 0
 276         return "".join(self.tokens)
 277
 278     def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
 279         startline = False
 280         indents = []
 281         toks_append = self.tokens.append
 282         toknum, tokval = token
 283         if toknum in (NAME, NUMBER):
 284             tokval += " "
 285         if toknum in (NEWLINE, NL):
 286             startline = True
 287         for tok in iterable:
 288             toknum, tokval = tok[:2]
 289
 290             if toknum in (NAME, NUMBER, ASYNC, AWAIT):
 291                 tokval += " "
 292
 293             if toknum == INDENT:
 294                 indents.append(tokval)
 295                 continue
 296             elif toknum == DEDENT:
 297                 indents.pop()
 298                 continue
 299             elif toknum in (NEWLINE, NL):
 300                 startline = True
 301             elif startline and indents:
 302                 toks_append(indents[-1])
 303                 startline = False
 304             toks_append(tokval)
 305
 306
# Matches a comment line carrying an encoding declaration such as
# "# -*- coding: utf-8 -*-"; group 1 is the declared encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches (on bytes) a line that is empty, whitespace-only or a comment;
# detect_encoding() only inspects a second line when the first matches this.
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 309
 310
 311 def _get_normal_name(orig_enc: str) -> str:
 312     """Imitates get_normal_name in tokenizer.c."""
 313     # Only care about the first 12 characters.
 314     enc = orig_enc[:12].lower().replace("_", "-")
 315     if enc == "utf-8" or enc.startswith("utf-8-"):
 316         return "utf-8"
 317     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
 318         ("latin-1-", "iso-8859-1-", "iso-latin-1-")
 319     ):
 320         return "iso-8859-1"
 321     return orig_enc
 322
 323
 324 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
 325     """
 326     The detect_encoding() function is used to detect the encoding that should
 327     be used to decode a Python source file. It requires one argument, readline,
 328     in the same way as the tokenize() generator.
 329
 330     It will call readline a maximum of twice, and return the encoding used
 331     (as a string) and a list of any lines (left as bytes) it has read
 332     in.
 333
 334     It detects the encoding from the presence of a utf-8 bom or an encoding
 335     cookie as specified in pep-0263. If both a bom and a cookie are present, but
 336     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
 337     charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 338     'utf-8-sig' is returned.
 339
 340     If no encoding is specified, then the default of 'utf-8' will be returned.
 341     """
 342     bom_found = False
 343     encoding = None
 344     default = "utf-8"
 345
 346     def read_or_stop() -> bytes:
 347         try:
 348             return readline()
 349         except StopIteration:
 350             return b""
 351
 352     def find_cookie(line: bytes) -> Optional[str]:
 353         try:
 354             line_string = line.decode("ascii")
 355         except UnicodeDecodeError:
 356             return None
 357         match = cookie_re.match(line_string)
 358         if not match:
 359             return None
 360         encoding = _get_normal_name(match.group(1))
 361         try:
 362             codec = lookup(encoding)
 363         except LookupError:
 364             # This behaviour mimics the Python interpreter
 365             raise SyntaxError("unknown encoding: " + encoding)
 366
 367         if bom_found:
 368             if codec.name != "utf-8":
 369                 # This behaviour mimics the Python interpreter
 370                 raise SyntaxError("encoding problem: utf-8")
 371             encoding += "-sig"
 372         return encoding
 373
 374     first = read_or_stop()
 375     if first.startswith(BOM_UTF8):
 376         bom_found = True
 377         first = first[3:]
 378         default = "utf-8-sig"
 379     if not first:
 380         return default, []
 381
 382     encoding = find_cookie(first)
 383     if encoding:
 384         return encoding, [first]
 385     if not blank_re.match(first):
 386         return default, [first]
 387
 388     second = read_or_stop()
 389     if not second:
 390         return default, [first]
 391
 392     encoding = find_cookie(second)
 393     if encoding:
 394         return encoding, [first, second]
 395
 396     return default, [first, second]
 397
 398
 399 def untokenize(iterable: Iterable[TokenInfo]) -> str:
 400     """Transform tokens back into Python source code.
 401
 402     Each element returned by the iterable must be a token sequence
 403     with at least two elements, a token number and token value.  If
 404     only two tokens are passed, the resulting output is poor.
 405
 406     Round-trip invariant for full input:
 407         Untokenized source will match input source exactly
 408
 409     Round-trip invariant for limited input:
 410         # Output text will tokenize the back to the input
 411         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 412         newcode = untokenize(t1)
 413         readline = iter(newcode.splitlines(1)).next
 414         t2 = [tok[:2] for tokin generate_tokens(readline)]
 415         assert t1 == t2
 416     """
 417     ut = Untokenizer()
 418     return ut.untokenize(iterable)
 419
 420
def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The optional grammar argument supplies the async_keywords flag. When it
    is true, `async` and `await` are always emitted as ASYNC/AWAIT tokens.
    Otherwise they are emitted as ASYNC/AWAIT only while an `async def` is
    being tokenized, except that an `async` directly followed by `def` or
    `for` is emitted as ASYNC as well.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: current line number; parenlev: bracket nesting depth;
    # continued: set after a backslash continuation.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr accumulates a string spanning several lines; needcont is set
    # when that string is single-quoted and continued by backslash-newline.
    contstr, needcont = "", 0
    contline: Optional[str] = None
    # Stack of indentation columns currently open.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # 'stashed' holds back an `async` NAME token until the following token
    # shows whether it should be emitted as ASYNC instead.
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # An iterator-style readline signals EOF by raising.
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; emit it as one token and
                # fall through to tokenize the rest of the line.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A backslash-continued string that neither closes nor
                # continues on this line is reported as an error token.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string; keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the indentation count.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # The line is whitespace only and has no line terminator.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # A comment-only line yields COMMENT followed by NL and does
                # not affect indentation.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async function context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same reset for an `async def` whose header line has ended and
            # whose indentation level is not below the current one.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # A line break inside brackets is NL, otherwise NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # The match covers only the opening quotes; look for the
                    # closing ones from the current position.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # endprogs is keyed by the bare quote character, which
                        # sits at index 0, 1 or 2 depending on the prefix.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold the token back until the next one is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the held-back `async` as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Any other token is an operator; track bracket depth.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: report one character and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    # Flush a held-back `async` token that was never followed by anything.
    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 695
 696
 697 if __name__ == "__main__":  # testing
 698     if len(sys.argv) > 1:
 699         tokenize(open(sys.argv[1]).readline)
 700     else:
 701         tokenize(sys.stdin.readline)