All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you could read over the Git project's submission guidelines and adhere to them,
I'd be especially grateful.
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
4 # mypy: allow-untyped-defs, allow-untyped-calls
6 """Tokenization help for Python programs.
8 generate_tokens(readline) is a generator that breaks a stream of
9 text into Python tokens. It accepts a readline-like method which is called
10 repeatedly to get the next line of input (or "" for EOF). It generates
11 5-tuples with these members:
13 the token type (see token.py)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19 It is designed to match the working of the Python tokenizer exactly, except
20 that it produces COMMENT tokens for comments and gives type OP for all
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26 are the same, except instead of generating tokens, tokeneater is a callback
27 function to which the 5 fields described above are passed as 5 arguments,
28 each time a new token is found."""
44 if sys.version_info >= (3, 8):
45 from typing import Final
47 from typing_extensions import Final
49 from blib2to3.pgen2.token import *
50 from blib2to3.pgen2.grammar import Grammar
52 __author__ = "Ka-Ping Yee <ping@lfw.org>"
53 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
56 from codecs import BOM_UTF8, lookup
57 from blib2to3.pgen2.token import *
61 __all__ = [x for x in dir(token) if x[0] != "_"] + [
# NOTE(review): extraction fragment — these are the return bodies of the
# regex-building helpers group()/any()/maybe(); their `def` lines are
# missing from this excerpt (embedded numbers jump 70 -> 74 -> 78).
# group(*choices): join alternatives into a capturing group "(a|b|c)".
70 return "(" + "|".join(choices) + ")"
# any(*choices): the grouped alternatives, repeated zero or more times.
74 return group(*choices) + "*"
# maybe(*choices): the grouped alternatives, made optional.
78 return group(*choices) + "?"
81 def _combinations(*l):
82 return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
# Regular-expression building blocks for the pseudo-tokenizer below.
# NOTE(review): extraction fragment — the embedded original line numbers jump
# (89 -> 92, 116 -> 118, 124 -> 133, ...), so several lines are missing here:
# the closing paren of Name, the ContStr/Operator/Bracket openers, and the
# `_strprefixes: Final = (` / `endprogs: Final = {` headers. Do not edit this
# region without the full file.
85 Whitespace = r"[ \f\t]*"
86 Comment = r"#[^\r\n]*"
87 Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
88 Name = ( # this is invalid but it's fine because Name comes after Number in all groups
89 r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
# Integer literals (binary/hex/octal/decimal) with PEP 515 underscore
# separators; the optional [lL] suffix matches Python 2-style long literals.
92 Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
93 Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
94 Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
95 Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
96 Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float and imaginary literals.
97 Exponent = r"[eE][-+]?\d+(?:_\d+)*"
98 Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
101 Expfloat = r"\d+(?:_\d+)*" + Exponent
102 Floatnumber = group(Pointfloat, Expfloat)
103 Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
104 Number = group(Imagnumber, Floatnumber, Intnumber)
106 # Tail end of ' string.
107 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
108 # Tail end of " string.
109 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
110 # Tail end of ''' string.
111 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
112 # Tail end of """ string.
113 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string-literal prefix: one of u/r/b/f or a legal two-letter combo.
114 _litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
115 Triple = group(_litprefix + "'''", _litprefix + '"""')
116 # Single-line ' or " string.
118 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
119 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
122 # Because of leftmost-then-longest match semantics, be sure to put the
123 # longest operators first (e.g., if = came before ==, == would get
124 # recognized as two instances of =).
# NOTE(review): the line below is one alternative of the Operator group;
# its `Operator = group(` opener and sibling alternatives are missing here.
133 r"[+\-*/%&@|^=<>:]=?",
138 Special = group(r"\r?\n", r"[:;.,`@]")
139 Funny = group(Operator, Bracket, Special)
141 # First (or only) line of ' or " string.
143 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
144 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
# PseudoToken matches one token (or a line continuation / comment /
# triple-quote opener) after optional leading whitespace; generate_tokens
# reads the token text out of group 1 (see pseudomatch.span(1) below).
146 PseudoExtras = group(r"\\\r?\n", Comment, Triple)
147 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
149 pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
150 single3prog = re.compile(Single3)
151 double3prog = re.compile(Double3)
# _strprefixes: every case-combination of r/f and r/b, plus legacy u/ur forms.
154 _combinations("r", "R", "f", "F")
155 | _combinations("r", "R", "b", "B")
156 | {"u", "U", "ur", "uR", "Ur", "UR"}
# endprogs maps a string opener (optionally prefixed) to the compiled regex
# that matches the remainder of that string.
160 "'": re.compile(Single),
161 '"': re.compile(Double),
164 **{f"{prefix}'''": single3prog for prefix in _strprefixes},
165 **{f'{prefix}"""': double3prog for prefix in _strprefixes},
# All recognized triple-quote openers, then all single-quote openers.
168 triple_quoted: Final = (
170 | {f"{prefix}'''" for prefix in _strprefixes}
171 | {f'{prefix}"""' for prefix in _strprefixes}
173 single_quoted: Final = (
175 | {f"{prefix}'" for prefix in _strprefixes}
176 | {f'{prefix}"' for prefix in _strprefixes}
# Raised when the tokenizer hits EOF inside an unfinished construct
# (generate_tokens raises it with "EOF in multi-line string" / "EOF in
# multi-line statement"). NOTE(review): class body missing from this excerpt.
182 class TokenError(Exception):
# Internal control-flow exception: a tokeneater callback may raise it to
# stop early; it is caught inside tokenize() below.
# NOTE(review): class body missing from this excerpt.
186 class StopTokenizing(Exception):
# Default tokeneater for tokenize(): prints one line per token in the form
# "srow,scol-erow,ecol:<tab>TYPE<tab>'text'".
# NOTE(review): extraction fragment — the enclosing print(...) call around
# the format expression below is missing from this excerpt.
190 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
# The xxx_todo_changeme names look like 2to3 conversion artifacts for the
# (row, col) start/end coordinate tuples — confirm against project history.
191 (srow, scol) = xxx_todo_changeme
192 (erow, ecol) = xxx_todo_changeme1
194 "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
# Type aliases: a source position as (row, col), and the callback signature
# used by tokenize(): (token type, token string, start, end, logical line).
198 Coord = Tuple[int, int]
199 TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
# Public driver: tokenize the stream from *readline*, passing each 5-tuple
# to *tokeneater* (printtoken by default).
# NOTE(review): extraction fragment — the docstring delimiters, the `try:`
# line and the body of the except clause are missing from this excerpt
# (embedded numbers jump 202 -> 204, 213 -> 216, 217 -> 221).
202 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
204 The tokenize() function accepts two parameters: one representing the
205 input stream, and one providing an output mechanism for tokenize().
207 The first parameter, readline, must be a callable object which provides
208 the same interface as the readline() method of built-in file objects.
209 Each call to the function should return one line of input as a string.
211 The second parameter, tokeneater, must also be a callable object. It is
212 called once for each token, with five arguments, corresponding to the
213 tuples generated by generate_tokens().
# A StopTokenizing raised by the callback aborts the loop.
216 tokenize_loop(readline, tokeneater)
217 except StopTokenizing:
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Drive generate_tokens() over *readline*, feeding every 5-tuple
    (type, string, start, end, line) to the *tokeneater* callback."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
# A fully-populated token 5-tuple; TokenInfo additionally admits the bare
# (type, string) 2-tuple form handled by Untokenizer.compat().
227 GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
228 TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
# NOTE(review): extraction fragment — only the signature of
# Untokenizer.__init__ survives here; the body (which presumably initializes
# self.tokens, self.prev_row and self.prev_col used by the methods below —
# confirm against the full file) is missing.
237 def __init__(self) -> None:
# Emit padding spaces so the next token starts at column start[1] of the
# output being rebuilt.
# NOTE(review): extraction fragment — the `row, col = start` unpacking line
# and the `if col_offset:` guard are missing from this excerpt.
242 def add_whitespace(self, start: Coord) -> None:
# Invariant from lib2to3: the start row must not be past the previously
# emitted position (note the direction: row <= prev_row).
244 assert row <= self.prev_row
245 col_offset = col - self.prev_col
247 self.tokens.append(" " * col_offset)
# Reassemble source text from an iterable of token tuples.
# NOTE(review): extraction fragment — the `for t in iterable:` loop header,
# the length-2 check that dispatches to compat(), and the post-NEWLINE
# row/col bookkeeping are missing from this excerpt.
249 def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
# 2-tuple tokens carry no position info: fall back to compat() mode for
# the rest of the stream.
252 self.compat(cast(Tuple[int, str], t), iterable)
254 tok_type, token, start, end, line = cast(
255 Tuple[int, Text, Coord, Coord, Text], t
# Pad up to the token's recorded start column, then emit its text.
257 self.add_whitespace(start)
258 self.tokens.append(token)
259 self.prev_row, self.prev_col = end
260 if tok_type in (NEWLINE, NL):
263 return "".join(self.tokens)
# Position-less (2-tuple) untokenizing: re-insert minimal whitespace and
# track indentation via an explicit stack of INDENT strings.
# NOTE(review): extraction fragment — the startline/indents initialization,
# the `for tok in iterable:` loop header and several branch bodies are
# missing from this excerpt (embedded numbers jump 265 -> 268, 277 -> 281).
265 def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
# Bind the bound method once; it is called on every token below.
268 toks_append = self.tokens.append
269 toknum, tokval = token
# NAME/NUMBER tokens get a trailing space so adjacent ones don't fuse.
270 if toknum in (NAME, NUMBER):
272 if toknum in (NEWLINE, NL):
275 toknum, tokval = tok[:2]
277 if toknum in (NAME, NUMBER, ASYNC, AWAIT):
# INDENT pushes its literal text; DEDENT pops it back off.
281 indents.append(tokval)
283 elif toknum == DEDENT:
286 elif toknum in (NEWLINE, NL):
# At the start of a line, re-emit the current indentation string.
288 elif startline and indents:
289 toks_append(indents[-1])
# PEP 263 encoding-cookie matcher (e.g. "# -*- coding: utf-8 -*-"), and a
# matcher for lines that are blank or comment-only; both are used by
# detect_encoding() below.
294 cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
295 blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
# NOTE(review): extraction fragment — the return statements and the tail of
# the second condition are missing from this excerpt (embedded numbers jump
# 302 -> 304, 305 -> 311).
298 def _get_normal_name(orig_enc: str) -> str:
299 """Imitates get_normal_name in tokenizer.c."""
300 # Only care about the first 12 characters.
301 enc = orig_enc[:12].lower().replace("_", "-")
# Normalize any utf-8 spelling variant to plain "utf-8".
302 if enc == "utf-8" or enc.startswith("utf-8-"):
# Normalize the latin-1 family of spellings similarly.
304 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
305 ("latin-1-", "iso-8859-1-", "iso-latin-1-")
# Sniff the source encoding per PEP 263 from at most the first two lines.
# NOTE(review): extraction fragment — the docstring delimiters, several
# try/except/return lines, and the BOM-stripping branch are missing from
# this excerpt (embedded numbers jump 311 -> 313, 362 -> 365, 373 -> 375).
311 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
313 The detect_encoding() function is used to detect the encoding that should
314 be used to decode a Python source file. It requires one argument, readline,
315 in the same way as the tokenize() generator.
317 It will call readline a maximum of twice, and return the encoding used
318 (as a string) and a list of any lines (left as bytes) it has read
321 It detects the encoding from the presence of a utf-8 bom or an encoding
322 cookie as specified in pep-0263. If both a bom and a cookie are present, but
323 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
324 charset, raise a SyntaxError. Note that if a utf-8 bom is found,
325 'utf-8-sig' is returned.
327 If no encoding is specified, then the default of 'utf-8' will be returned.
# Helper: return the next raw line, or b"" at end of input.
333 def read_or_stop() -> bytes:
336 except StopIteration:
# Helper: return the normalized cookie encoding found on *line*, or None.
339 def find_cookie(line: bytes) -> Optional[str]:
# The cookie line itself must be pure ASCII.
341 line_string = line.decode("ascii")
342 except UnicodeDecodeError:
344 match = cookie_re.match(line_string)
347 encoding = _get_normal_name(match.group(1))
349 codec = lookup(encoding)
351 # This behaviour mimics the Python interpreter
352 raise SyntaxError("unknown encoding: " + encoding)
# With a UTF-8 BOM present, any cookie must agree (i.e. name utf-8).
355 if codec.name != "utf-8":
356 # This behaviour mimics the Python interpreter
357 raise SyntaxError("encoding problem: utf-8")
361 first = read_or_stop()
362 if first.startswith(BOM_UTF8):
365 default = "utf-8-sig"
369 encoding = find_cookie(first)
371 return encoding, [first]
# A cookie is only honored on line 1, or on line 2 when line 1 is
# blank/comment-only (blank_re).
372 if not blank_re.match(first):
373 return default, [first]
375 second = read_or_stop()
377 return default, [first]
379 encoding = find_cookie(second)
381 return encoding, [first, second]
383 return default, [first, second]
386 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
387 """Transform tokens back into Python source code.
389 Each element returned by the iterable must be a token sequence
390 with at least two elements, a token number and token value. If
391 only two tokens are passed, the resulting output is poor.
393 Round-trip invariant for full input:
394 Untokenized source will match input source exactly
396 Round-trip invariant for limited input:
397 # Output text will tokenize back to the input
398 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
399 newcode = untokenize(t1)
400 readline = iter(newcode.splitlines(1)).next
401 t2 = [tok[:2] for tok in generate_tokens(readline)]
# NOTE(review): extraction fragment — the docstring's closing quotes and the
# `ut = Untokenizer()` construction preceding this return are missing from
# this excerpt (embedded numbers jump 401 -> 405).
405 return ut.untokenize(iterable)
# Core tokenizer: yields GoodTokenInfo 5-tuples for the stream produced by
# *readline* (see the docstring fragment below for the tuple layout).
# NOTE(review): heavily fragmented extraction — the `def generate_tokens(`
# line, the docstring delimiters and many statements/branches are missing
# (the embedded original numbers jump throughout, e.g. 410 -> 412,
# 436 -> 441, 468 -> 480, 493 -> 505, 608 -> 611, 638 -> 654). Do not edit
# this region without the full file.
409 readline: Callable[[], Text], grammar: Optional[Grammar] = None
410 ) -> Iterator[GoodTokenInfo]:
412 The generate_tokens() generator requires one argument, readline, which
413 must be a callable object which provides the same interface as the
414 readline() method of built-in file objects. Each call to the function
415 should return one line of input as a string. Alternately, readline
416 can be a callable function terminating with StopIteration:
417 readline = open(myfile).next # Example of alternate readline
419 The generator produces 5-tuples with these members: the token type; the
420 token string; a 2-tuple (srow, scol) of ints specifying the row and
421 column where the token begins in the source; a 2-tuple (erow, ecol) of
422 ints specifying the row and column where the token ends in the source;
423 and the line on which the token was found. The line passed is the
424 logical line; continuation lines are included.
# Tokenizer state: line number, paren nesting depth, backslash-continuation
# flag, and the accumulators for an in-progress multi-line string.
426 lnum = parenlev = continued = 0
427 numchars: Final[str] = "0123456789"
428 contstr, needcont = "", 0
429 contline: Optional[str] = None
432 # If we know we're parsing 3.7+, we can unconditionally parse `async` and
433 # `await` as keywords.
434 async_keywords = False if grammar is None else grammar.async_keywords
435 # 'stashed' and 'async_*' are used for async/await parsing
436 stashed: Optional[GoodTokenInfo] = None
441 strstart: Tuple[int, int]
442 endprog: Pattern[str]
# Main loop: one physical line per iteration.
444 while 1: # loop over lines in stream
447 except StopIteration:
450 pos, max = 0, len(line)
# State 1: inside an unterminated (multi-line) string literal.
452 if contstr: # continued string
453 assert contline is not None
455 raise TokenError("EOF in multi-line string", strstart)
456 endmatch = endprog.match(line)
458 pos = end = endmatch.end(0)
461 contstr + line[:end],
466 contstr, needcont = "", 0
# A continued single-quoted string line must end with a backslash.
468 elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
480 contstr = contstr + line
481 contline = contline + line
# State 2: start of a new logical line — measure its indentation.
484 elif parenlev == 0 and not continued: # new statement
488 while pos < max: # measure leading whitespace
491 elif line[pos] == "\t":
492 column = (column // tabsize + 1) * tabsize
493 elif line[pos] == "\f":
505 if line[pos] in "\r\n": # skip blank lines
506 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
509 if line[pos] == "#": # skip comments
510 comment_token = line[pos:].rstrip("\r\n")
511 nl_pos = pos + len(comment_token)
519 yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
# Emit INDENT/DEDENT by comparing the column against the indents stack.
522 if column > indents[-1]: # count indents
523 indents.append(column)
524 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
526 while column < indents[-1]: # count dedents
527 if column not in indents:
528 raise IndentationError(
529 "unindent does not match any outer indentation level",
530 ("<tokenize>", lnum, pos, line),
532 indents = indents[:-1]
534 if async_def and async_def_indent >= indents[-1]:
539 yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
541 if async_def and async_def_nl and async_def_indent >= indents[-1]:
# State 3: continuation of the previous logical line.
546 else: # continued statement
548 raise TokenError("EOF in multi-line statement", (lnum, 0))
# Scan tokens within the line using the PseudoToken regex; group 1 holds
# the token text.
552 pseudomatch = pseudoprog.match(line, pos)
553 if pseudomatch: # scan for tokens
554 start, end = pseudomatch.span(1)
555 spos, epos, pos = (lnum, start), (lnum, end), end
556 token, initial = line[start:end], line[start]
558 if initial in numchars or (
559 initial == "." and token != "."
561 yield (NUMBER, token, spos, epos, line)
562 elif initial in "\r\n":
571 yield (newline, token, spos, epos, line)
574 assert not token.endswith("\n")
578 yield (COMMENT, token, spos, epos, line)
# Triple-quoted string: either closed on this line, or start accumulating
# contstr/contline for the multi-line case.
579 elif token in triple_quoted:
580 endprog = endprogs[token]
581 endmatch = endprog.match(line, pos)
582 if endmatch: # all on one line
583 pos = endmatch.end(0)
584 token = line[start:pos]
588 yield (STRING, token, spos, (lnum, pos), line)
590 strstart = (lnum, start) # multiple lines
591 contstr = line[start:]
# Single-quoted string, possibly with a 1- or 2-letter prefix.
595 initial in single_quoted
596 or token[:2] in single_quoted
597 or token[:3] in single_quoted
599 if token[-1] == "\n": # continued string
600 strstart = (lnum, start)
# Pick the tail-matcher keyed by the quote character, skipping the prefix.
602 endprogs.get(initial)
603 or endprogs.get(token[1])
604 or endprogs.get(token[2])
606 assert maybe_endprog is not None, f"endprog not found for {token}"
607 endprog = maybe_endprog
608 contstr, needcont = line[start:], 1
611 else: # ordinary string
615 yield (STRING, token, spos, epos, line)
# Identifiers, including the async/await stashing used when the grammar
# does not treat them as unconditional keywords.
616 elif initial.isidentifier(): # ordinary name
617 if token in ("async", "await"):
618 if async_keywords or async_def:
620 ASYNC if token == "async" else AWAIT,
628 tok = (NAME, token, spos, epos, line)
629 if token == "async" and not stashed:
633 if token in ("def", "for"):
634 if stashed and stashed[0] == NAME and stashed[1] == "async":
638 async_def_indent = indents[-1]
654 elif initial == "\\": # continued stmt
655 # This yield is new; needed for better idempotency:
659 yield (NL, token, spos, (lnum, pos), line)
664 elif initial in ")]}":
669 yield (OP, token, spos, epos, line)
671 yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
# End of input: close any remaining indentation levels, then end marker.
678 for indent in indents[1:]: # pop remaining indent levels
679 yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
680 yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
# Script entry point: tokenize the file named by argv[1], or stdin.
# NOTE(review): extraction fragment — the `import sys` and `else:` lines are
# missing from this excerpt; also note the open() handle is never closed
# (pre-existing; a `with` block would fix it, but that is a code change).
683 if __name__ == "__main__": # testing
686 if len(sys.argv) > 1:
687 tokenize(open(sys.argv[1]).readline)
689 tokenize(sys.stdin.readline)