# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls
6 """Tokenization help for Python programs.
8 generate_tokens(readline) is a generator that breaks a stream of
9 text into Python tokens. It accepts a readline-like method which is called
10 repeatedly to get the next line of input (or "" for EOF). It generates
11 5-tuples with these members:
13 the token type (see token.py)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19 It is designed to match the working of the Python tokenizer exactly, except
20 that it produces COMMENT tokens for comments and gives type OP for all
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26 are the same, except instead of generating tokens, tokeneater is a callback
27 function to which the 5 fields described above are passed as 5 arguments,
28 each time a new token is found."""
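
# Illustrative sketch (not part of the original module): for the source line
# "x = 1\n", generate_tokens() yields 5-tuples roughly like
#
#     (NAME,      "x",  (1, 0), (1, 1), "x = 1\n")
#     (OP,        "=",  (1, 2), (1, 3), "x = 1\n")
#     (NUMBER,    "1",  (1, 4), (1, 5), "x = 1\n")
#     (NEWLINE,   "\n", (1, 5), (1, 6), "x = 1\n")
#     (ENDMARKER, "",   (2, 0), (2, 0), "")
#
# i.e. (type, string, (start row, start col), (end row, end col), line).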

import sys
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Text,
    Tuple,
    Union,
    cast,
)

if sys.version_info >= (3, 8):
    from typing import Final
else:
    from typing_extensions import Final

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token

def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
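
# Illustrative examples (not part of the original source): the helpers above
# simply compose regular-expression fragments, e.g.
#
#     group("=", "==")   ->  "(=|==)"
#     maybe(r"\d")       ->  r"(\d)?"
#     any(r"\d")         ->  r"(\d)*"
#
# and _combinations("r", "R", "b", "B") yields every one- and two-letter
# string-prefix spelling that mixes the two letters in either order and any
# case, e.g. {"r", "B", "rb", "Rb", "bR", ...}.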

Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
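
# Illustrative examples (not part of the original source): Number accepts the
# usual literal spellings, including underscores and the legacy "L" suffix
# these patterns still tolerate, e.g.
#
#     0b1010_0101   0xDEAD_BEEF   0o755   1_000_000   3.14e-2   .5j   10L
#
# Imagnumber is tried first, then Floatnumber, then Intnumber, so "2j" is
# matched as one NUMBER rather than as the integer "2" followed by a name.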

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
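
# Illustrative example (not part of the original source): alternation in
# Python regular expressions takes the first alternative that matches, so
# ordering matters:
#
#     re.match(group("==", "="), "==").group()   ->  "=="
#     re.match(group("=", "=="), "==").group()   ->  "="   (only one character)
#
# which is why the multi-character operators are listed before their
# single-character prefixes above.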

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
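
# Illustrative example (not part of the original source): pseudoprog is the
# workhorse pattern used by generate_tokens(). Group 1 is the token itself,
# with any leading whitespace skipped, e.g.
#
#     m = pseudoprog.match("    spam = 42\n", 0)
#     m.span(1)   ->  (4, 8)     # the NAME token "spam"
#     m.group(1)  ->  "spam"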
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: Text, srow_col: Coord, erow_col: Coord, line: Text
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], Text], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
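
# Illustrative usage (not part of the original module): tokenize() just wants
# a readline-style callable plus an optional callback, e.g.
#
#     import io
#     source = "def f(x):\n    return x + 1\n"
#     tokenize(io.StringIO(source).readline)            # prints via printtoken
#     tokenize(io.StringIO(source).readline, my_eater)   # my_eater: any
#                                                        # hypothetical TokenEater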

GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
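
# Illustrative examples (not part of the original source): cookie_re
# recognises PEP 263 encoding declarations such as
#
#     # -*- coding: latin-1 -*-
#     # vim: set fileencoding=utf-8 :
#
# while blank_re matches lines that are empty or comment-only, which may
# legally precede the cookie on line two.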

def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
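
# Illustrative usage (not part of the original module): detect_encoding() is
# normally handed the readline method of a file opened in binary mode, e.g.
#
#     with open("some_module.py", "rb") as f:   # hypothetical path
#         encoding, first_lines = detect_encoding(f.readline)
#
# encoding is e.g. "utf-8", "utf-8-sig" or "iso-8859-1"; first_lines holds the
# (at most two) raw lines consumed while looking for a PEP 263 cookie.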

def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
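
# Illustrative round trip (not part of the original module):
#
#     import io
#     source = "if x:\n    y = 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source   # full 5-tuples reproduce the input
#
# Passing only (type, string) pairs still yields code that tokenizes the same
# way, but the original spacing is not preserved.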

def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
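
# Illustrative usage (not part of the original module): any readline-style
# callable drives the generator, e.g.
#
#     import io
#     src = "async def f():\n    await g()\n"
#     for tok_type, tok_str, start, end, logical_line in generate_tokens(
#         io.StringIO(src).readline
#     ):
#         print(tok_name[tok_type], repr(tok_str), start, end)
#
# Passing a Grammar whose async_keywords attribute is true makes `async` and
# `await` come back as ASYNC/AWAIT tokens unconditionally; otherwise the
# stashing logic above only recognises `async def` / `async for`, and treats
# `await` as a keyword inside an `async def` block.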


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)