# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
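
# Illustrative usage sketch (editorial addition, not part of the upstream
# module): tokenizing a small in-memory snippet with generate_tokens(), which
# is defined below. io.StringIO(...).readline stands in for a file's readline.
#
#   import io
#   for tok_type, tok_str, start, end, line in generate_tokens(
#       io.StringIO("x = 1\n").readline
#   ):
#       print(tok_name[tok_type], repr(tok_str), start, end)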

import re
import sys
from codecs import BOM_UTF8, lookup
from typing import Callable, Final, Iterable, Iterator, List, Optional, Pattern, Set, Tuple, Union, cast

from blib2to3.pgen2 import token
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import *

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

__all__ = [x for x in dir(token) if x[0] != "_"] + ["tokenize", "generate_tokens", "untokenize"]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
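

# For reference (editorial note): group(), any(), and maybe() just build regex
# source strings, e.g.
#   group("a", "b")  -> "(a|b)"
#   any("a", "b")    -> "(a|b)*"
#   maybe("a", "b")  -> "(a|b)?"
# while _combinations("r", "b") -> {"r", "b", "rb", "br"}; pairs differing only
# in case (e.g. "rR") are dropped by the casefold() check.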

Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
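
# Editorial note: examples of literals matched by Number include "0b1010",
# "0xFF", "0o777", "1_000", "3.14", ".5e10", "1e-9", and "2j".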

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", r"//=?", r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
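
# Editorial note: the scanner below matches PseudoToken one token at a time;
# group 1 is the token text after any leading whitespace. For example:
#
#   m = pseudoprog.match("    x = 1\n", 0)
#   m.group(1)  # -> "x"
#   m.span(1)   # -> (4, 5)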

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
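
# Editorial note: with the prefixes above, triple_quoted contains openers such
# as "'''", '"""', "r'''", and 'rb"""', while single_quoted holds the bare and
# prefixed one-character openers such as "'", '"', "r'", and 'b"'.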

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, str, Coord, Coord, str], None]


def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
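

# Illustrative sketch (editorial addition): driving the callback interface with
# a custom tokeneater; `src` is a made-up snippet.
#
#   import io
#   src = "a = b + 1\n"
#   def eater(ty: int, tok: str, start: Coord, end: Coord, line: str) -> None:
#       print(tok_name[ty], repr(tok))
#   tokenize(io.StringIO(src).readline, eater)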


GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    def __init__(self) -> None:
        self.tokens: List[str] = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, str, Coord, Coord, str], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents: List[str] = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
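
# Editorial note: cookie_re matches PEP 263 coding declarations; e.g. for the
# line "# -*- coding: latin-1 -*-", match.group(1) is "latin-1". blank_re
# matches byte lines that are blank or start (after whitespace) with a comment.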


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised as well. Note that if a UTF-8 BOM
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return b""

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)
        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
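

# Illustrative sketch (editorial addition): detect_encoding() on an in-memory
# byte stream; the source text is made up.
#
#   import io
#   buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#   encoding, lines = detect_encoding(buf.readline)
#   # encoding == "iso-8859-1" (the normalized name); lines are the bytes read.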


def untokenize(iterable: Iterable[TokenInfo]) -> str:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
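

# Illustrative round trip (editorial addition, mirroring the docstring above):
#
#   import io
#   src = "x = (1 +\n     2)\n"
#   toks = list(generate_tokens(io.StringIO(src).readline))
#   assert untokenize(toks) == src  # full 5-tuples reproduce the source exactly
#   rough = untokenize([t[:2] for t in toks])  # 2-tuples lose exact spacing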


def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end], strstart, (lnum, end), contline + line)
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline)
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token, (lnum, pos), (lnum, nl_pos), line)
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)
                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (ASYNC if token == "async" else AWAIT, token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]
                            yield (ASYNC, stashed[1], stashed[2], stashed[3], stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)