# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
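
# Illustrative example (a minimal sketch): for the input line "x = 1\n" the
# first 5-tuple produced by generate_tokens() is, roughly,
#     (NAME, "x", (1, 0), (1, 1), "x = 1\n")
# i.e. token type, token string, start (row, col), end (row, col), source line.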

import re
from codecs import BOM_UTF8, lookup
from typing import (
    Callable, Final, Iterable, Iterator, List, Optional, Pattern, Set, Text,
    Tuple, Union, cast,
)

from blib2to3.pgen2 import token  # bound as a module so the dir(token) call below works
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import *

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize", "generate_tokens", "untokenize",
]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
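

# For example, _combinations("r", "R", "b", "B") contains the single prefixes
# "r", "R", "b", "B" plus every two-letter combination whose letters differ
# case-insensitively ("rb", "rB", "Rb", "RB", "br", "bR", "Br", "BR"), but
# never "rr", "rR", or "bB".
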
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", r"//=?", r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: Text, srow_col: Coord, erow_col: Coord, line: Text
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], Text], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
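

# Usage sketch (illustrative helper, not called anywhere; io.StringIO merely
# stands in for a real file's readline): drive tokenize() with a custom
# tokeneater callback.
def _example_tokeneater_usage() -> None:
    import io

    def show(
        tok_type: int, tok_str: Text, start: Coord, end: Coord, line: Text
    ) -> None:
        print(tok_name[tok_type], repr(tok_str), start, end)

    tokenize(io.StringIO("x = 1\n").readline, show)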


GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
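
# For example, cookie_re matches coding declarations such as
#     # -*- coding: utf-8 -*-
#     # coding=latin-1
# with group(1) capturing the encoding name ("utf-8", "latin-1").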


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError is raised. If the encoding cookie is an invalid
    charset, a SyntaxError is raised. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
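

# Usage sketch (illustrative helper, not called anywhere; io.BytesIO merely
# stands in for a binary file): sniff the encoding of an in-memory buffer.
def _example_detect_encoding() -> None:
    import io

    buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
    encoding, lines = detect_encoding(buf.readline)
    # Expected: encoding is the normalized name "iso-8859-1" and lines holds
    # the single cookie line that was consumed while sniffing.
    print(encoding, lines)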


def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
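

# Usage sketch (illustrative helper, not called anywhere; io.StringIO merely
# stands in for a file) of the "full input" round-trip invariant documented above.
def _example_untokenize_roundtrip() -> None:
    import io

    source = "x = 1\ny = 2\n"
    tokens = list(generate_tokens(io.StringIO(source).readline))
    assert untokenize(tokens) == source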


def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        except StopIteration:
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
                pos = end = endmatch.end(0)
                    contstr + line[:end],
                contstr, needcont = "", 0
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                contstr = contstr + line
                contline = contline + line

        elif parenlev == 0 and not continued:  # new statement
            while pos < max:  # measure leading whitespace
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:

        else:  # continued statement
                raise TokenError("EOF in multi-line statement", (lnum, 0))

            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    yield (newline, token, spos, epos, line)

                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                    else:  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                                ASYNC if token == "async" else AWAIT,

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                                async_def_indent = indents[-1]

                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)

                    elif initial in ")]}":
                    yield (OP, token, spos, epos, line)

                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
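

# Usage sketch (illustrative helper, not called anywhere; io.StringIO merely
# stands in for a file): iterate over the tokens of a small piece of source.
def _example_generate_tokens() -> None:
    import io

    source = "def f():\n    return 1\n"
    for tok_type, tok_str, start, end, line in generate_tokens(
        io.StringIO(source).readline
    ):
        print(tok_name[tok_type], repr(tok_str), start, end)
    # The stream ends with NEWLINE, DEDENT, ENDMARKER for the indented body.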


if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)