All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
4 # mypy: allow-untyped-defs, allow-untyped-calls
6 """Tokenization help for Python programs.
8 generate_tokens(readline) is a generator that breaks a stream of
9 text into Python tokens. It accepts a readline-like method which is called
10 repeatedly to get the next line of input (or "" for EOF). It generates
11 5-tuples with these members:
13 the token type (see token.py)
14 the token (a string)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19 It is designed to match the working of the Python tokenizer exactly, except
20 that it produces COMMENT tokens for comments and gives type OP for all operators.
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26 are the same, except instead of generating tokens, tokeneater is a callback
27 function to which the 5 fields described above are passed as 5 arguments,
28 each time a new token is found."""
45 from blib2to3.pgen2.grammar import Grammar
46 from blib2to3.pgen2.token import (
63 __author__ = "Ka-Ping Yee <ping@lfw.org>"
64 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
67 from codecs import BOM_UTF8, lookup
71 __all__ = [x for x in dir(token) if x[0] != "_"] + [
def group(*choices: str) -> str:
    """Return a regex fragment that matches any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in a single
    parenthesised (capturing) group.
    """
    alternation = "|".join(choices)
    return f"({alternation})"
def any(*choices: str) -> str:
    """Return a regex fragment matching zero or more repetitions of *choices*.

    The alternatives are first combined with group(), then followed by ``*``.
    """
    alternatives = group(*choices)
    return f"{alternatives}*"
def maybe(*choices: str) -> str:
    """Return a regex fragment matching at most one occurrence of *choices*.

    The alternatives are first combined with group(), then followed by ``?``.
    """
    alternatives = group(*choices)
    return f"{alternatives}?"
def _combinations(*l: str) -> Set[str]:
    """Return all one- and two-element concatenations of the given strings.

    Each argument is paired with every argument and with the empty string;
    a pairing is kept only when the two parts differ after case-folding, so
    an item is never combined with itself or with its other-case twin.
    """
    second_parts = l + ("",)
    combined: Set[str] = set()
    for first in l:
        folded_first = first.casefold()
        for second in second_parts:
            if folded_first != second.casefold():
                combined.add(first + second)
    return combined
95 Whitespace = r"[ \f\t]*"
96 Comment = r"#[^\r\n]*"
97 Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
98 Name = ( # not a strict identifier pattern (it also accepts e.g. a leading digit), but that's fine because Name comes after Number in all groups
99 r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
102 Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
103 Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
104 Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
105 Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
106 Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
107 Exponent = r"[eE][-+]?\d+(?:_\d+)*"
108 Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
111 Expfloat = r"\d+(?:_\d+)*" + Exponent
112 Floatnumber = group(Pointfloat, Expfloat)
113 Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
114 Number = group(Imagnumber, Floatnumber, Intnumber)
116 # Tail end of ' string.
117 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
118 # Tail end of " string.
119 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
120 # Tail end of ''' string.
121 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
122 # Tail end of """ string.
123 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
124 _litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
125 Triple = group(_litprefix + "'''", _litprefix + '"""')
126 # Single-line ' or " string.
128 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
129 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
132 # Because of leftmost-then-longest match semantics, be sure to put the
133 # longest operators first (e.g., if = came before ==, == would get
134 # recognized as two instances of =).
143 r"[+\-*/%&@|^=<>:]=?",
148 Special = group(r"\r?\n", r"[:;.,`@]")
149 Funny = group(Operator, Bracket, Special)
151 # First (or only) line of ' or " string.
153 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
154 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
156 PseudoExtras = group(r"\\\r?\n", Comment, Triple)
157 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
159 pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
160 single3prog = re.compile(Single3)
161 double3prog = re.compile(Double3)
164 _combinations("r", "R", "f", "F")
165 | _combinations("r", "R", "b", "B")
166 | {"u", "U", "ur", "uR", "Ur", "UR"}
170 "'": re.compile(Single),
171 '"': re.compile(Double),
174 **{f"{prefix}'''": single3prog for prefix in _strprefixes},
175 **{f'{prefix}"""': double3prog for prefix in _strprefixes},
178 triple_quoted: Final = (
180 | {f"{prefix}'''" for prefix in _strprefixes}
181 | {f'{prefix}"""' for prefix in _strprefixes}
183 single_quoted: Final = (
185 | {f"{prefix}'" for prefix in _strprefixes}
186 | {f'{prefix}"' for prefix in _strprefixes}
192 class TokenError(Exception):
196 class StopTokenizing(Exception):
200 Coord = Tuple[int, int]
204 type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
205 ) -> None: # for testing
206 (srow, scol) = srow_col
207 (erow, ecol) = erow_col
209 "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
213 TokenEater = Callable[[int, str, Coord, Coord, str], None]
216 def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
218 The tokenize() function accepts two parameters: one representing the
219 input stream, and one providing an output mechanism for tokenize().
221 The first parameter, readline, must be a callable object which provides
222 the same interface as the readline() method of built-in file objects.
223 Each call to the function should return one line of input as a string.
225 The second parameter, tokeneater, must also be a callable object. It is
226 called once for each token, with five arguments, corresponding to the
227 tuples generated by generate_tokens().
230 tokenize_loop(readline, tokeneater)
231 except StopTokenizing:
235 # backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    """Feed every token produced from *readline* to the *tokeneater* callback.

    Each tuple yielded by generate_tokens(readline) is unpacked and passed
    to tokeneater as positional arguments.
    """
    token_stream = generate_tokens(readline)
    for fields in token_stream:
        tokeneater(*fields)
241 GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
242 TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
250 def __init__(self) -> None:
255 def add_whitespace(self, start: Coord) -> None:
257 assert row <= self.prev_row
258 col_offset = col - self.prev_col
260 self.tokens.append(" " * col_offset)
262 def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
265 self.compat(cast(Tuple[int, str], t), iterable)
267 tok_type, token, start, end, line = cast(
268 Tuple[int, str, Coord, Coord, str], t
270 self.add_whitespace(start)
271 self.tokens.append(token)
272 self.prev_row, self.prev_col = end
273 if tok_type in (NEWLINE, NL):
276 return "".join(self.tokens)
278 def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
281 toks_append = self.tokens.append
282 toknum, tokval = token
283 if toknum in (NAME, NUMBER):
285 if toknum in (NEWLINE, NL):
288 toknum, tokval = tok[:2]
290 if toknum in (NAME, NUMBER, ASYNC, AWAIT):
294 indents.append(tokval)
296 elif toknum == DEDENT:
299 elif toknum in (NEWLINE, NL):
301 elif startline and indents:
302 toks_append(indents[-1])
307 cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
308 blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
311 def _get_normal_name(orig_enc: str) -> str:
312 """Imitates get_normal_name in tokenizer.c."""
313 # Only care about the first 12 characters.
314 enc = orig_enc[:12].lower().replace("_", "-")
315 if enc == "utf-8" or enc.startswith("utf-8-"):
317 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
318 ("latin-1-", "iso-8859-1-", "iso-latin-1-")
324 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
326 The detect_encoding() function is used to detect the encoding that should
327 be used to decode a Python source file. It requires one argument, readline,
328 in the same way as the tokenize() generator.
330 It will call readline a maximum of twice, and return the encoding used
331 (as a string) and a list of any lines (left as bytes) it has read
334 It detects the encoding from the presence of a utf-8 bom or an encoding
335 cookie as specified in pep-0263. If both a bom and a cookie are present, but
336 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
337 charset, raise a SyntaxError. Note that if a utf-8 bom is found,
338 'utf-8-sig' is returned.
340 If no encoding is specified, then the default of 'utf-8' will be returned.
346 def read_or_stop() -> bytes:
349 except StopIteration:
352 def find_cookie(line: bytes) -> Optional[str]:
354 line_string = line.decode("ascii")
355 except UnicodeDecodeError:
357 match = cookie_re.match(line_string)
360 encoding = _get_normal_name(match.group(1))
362 codec = lookup(encoding)
364 # This behaviour mimics the Python interpreter
365 raise SyntaxError("unknown encoding: " + encoding)
368 if codec.name != "utf-8":
369 # This behaviour mimics the Python interpreter
370 raise SyntaxError("encoding problem: utf-8")
374 first = read_or_stop()
375 if first.startswith(BOM_UTF8):
378 default = "utf-8-sig"
382 encoding = find_cookie(first)
384 return encoding, [first]
385 if not blank_re.match(first):
386 return default, [first]
388 second = read_or_stop()
390 return default, [first]
392 encoding = find_cookie(second)
394 return encoding, [first, second]
396 return default, [first, second]
399 def untokenize(iterable: Iterable[TokenInfo]) -> str:
400 """Transform tokens back into Python source code.
402 Each element returned by the iterable must be a token sequence
403 with at least two elements, a token number and token value. If
404 only two tokens are passed, the resulting output is poor.
406 Round-trip invariant for full input:
407 Untokenized source will match input source exactly
409 Round-trip invariant for limited input:
410 # Output text will tokenize back to the input
411 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
412 newcode = untokenize(t1)
413 readline = iter(newcode.splitlines(1)).__next__
414 t2 = [tok[:2] for tok in generate_tokens(readline)]
418 return ut.untokenize(iterable)
422 readline: Callable[[], str], grammar: Optional[Grammar] = None
423 ) -> Iterator[GoodTokenInfo]:
425 The generate_tokens() generator requires one argument, readline, which
426 must be a callable object which provides the same interface as the
427 readline() method of built-in file objects. Each call to the function
428 should return one line of input as a string. Alternately, readline
429 can be a callable function terminating with StopIteration:
430 readline = open(myfile).__next__ # Example of alternate readline
432 The generator produces 5-tuples with these members: the token type; the
433 token string; a 2-tuple (srow, scol) of ints specifying the row and
434 column where the token begins in the source; a 2-tuple (erow, ecol) of
435 ints specifying the row and column where the token ends in the source;
436 and the line on which the token was found. The line passed is the
437 logical line; continuation lines are included.
439 lnum = parenlev = continued = 0
440 numchars: Final[str] = "0123456789"
441 contstr, needcont = "", 0
442 contline: Optional[str] = None
445 # If we know we're parsing 3.7+, we can unconditionally parse `async` and
446 # `await` as keywords.
447 async_keywords = False if grammar is None else grammar.async_keywords
448 # 'stashed' and 'async_*' are used for async/await parsing
449 stashed: Optional[GoodTokenInfo] = None
454 strstart: Tuple[int, int]
455 endprog: Pattern[str]
457 while 1: # loop over lines in stream
460 except StopIteration:
463 pos, max = 0, len(line)
465 if contstr: # continued string
466 assert contline is not None
468 raise TokenError("EOF in multi-line string", strstart)
469 endmatch = endprog.match(line)
471 pos = end = endmatch.end(0)
474 contstr + line[:end],
479 contstr, needcont = "", 0
481 elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
493 contstr = contstr + line
494 contline = contline + line
497 elif parenlev == 0 and not continued: # new statement
501 while pos < max: # measure leading whitespace
504 elif line[pos] == "\t":
505 column = (column // tabsize + 1) * tabsize
506 elif line[pos] == "\f":
518 if line[pos] in "\r\n": # skip blank lines
519 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
522 if line[pos] == "#": # skip comments
523 comment_token = line[pos:].rstrip("\r\n")
524 nl_pos = pos + len(comment_token)
532 yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
535 if column > indents[-1]: # count indents
536 indents.append(column)
537 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
539 while column < indents[-1]: # count dedents
540 if column not in indents:
541 raise IndentationError(
542 "unindent does not match any outer indentation level",
543 ("<tokenize>", lnum, pos, line),
545 indents = indents[:-1]
547 if async_def and async_def_indent >= indents[-1]:
552 yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
554 if async_def and async_def_nl and async_def_indent >= indents[-1]:
559 else: # continued statement
561 raise TokenError("EOF in multi-line statement", (lnum, 0))
565 pseudomatch = pseudoprog.match(line, pos)
566 if pseudomatch: # scan for tokens
567 start, end = pseudomatch.span(1)
568 spos, epos, pos = (lnum, start), (lnum, end), end
569 token, initial = line[start:end], line[start]
571 if initial in numchars or (
572 initial == "." and token != "."
574 yield (NUMBER, token, spos, epos, line)
575 elif initial in "\r\n":
584 yield (newline, token, spos, epos, line)
587 assert not token.endswith("\n")
591 yield (COMMENT, token, spos, epos, line)
592 elif token in triple_quoted:
593 endprog = endprogs[token]
594 endmatch = endprog.match(line, pos)
595 if endmatch: # all on one line
596 pos = endmatch.end(0)
597 token = line[start:pos]
601 yield (STRING, token, spos, (lnum, pos), line)
603 strstart = (lnum, start) # multiple lines
604 contstr = line[start:]
608 initial in single_quoted
609 or token[:2] in single_quoted
610 or token[:3] in single_quoted
612 if token[-1] == "\n": # continued string
613 strstart = (lnum, start)
615 endprogs.get(initial)
616 or endprogs.get(token[1])
617 or endprogs.get(token[2])
620 maybe_endprog is not None
621 ), f"endprog not found for {token}"
622 endprog = maybe_endprog
623 contstr, needcont = line[start:], 1
626 else: # ordinary string
630 yield (STRING, token, spos, epos, line)
631 elif initial.isidentifier(): # ordinary name
632 if token in ("async", "await"):
633 if async_keywords or async_def:
635 ASYNC if token == "async" else AWAIT,
643 tok = (NAME, token, spos, epos, line)
644 if token == "async" and not stashed:
648 if token in ("def", "for"):
649 if stashed and stashed[0] == NAME and stashed[1] == "async":
652 async_def_indent = indents[-1]
668 elif initial == "\\": # continued stmt
669 # This yield is new; needed for better idempotency:
673 yield (NL, token, spos, (lnum, pos), line)
678 elif initial in ")]}":
683 yield (OP, token, spos, epos, line)
685 yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
692 for _indent in indents[1:]: # pop remaining indent levels
693 yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
694 yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
697 if __name__ == "__main__": # testing
698 if len(sys.argv) > 1:
699 tokenize(open(sys.argv[1]).readline)
701 tokenize(sys.stdin.readline)