# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
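# Editor's note: a minimal usage sketch, not part of the original module,
# assuming only the public generate_tokens() API documented above:
#
#     import io
#
#     for tok_type, tok_str, start, end, logical_line in generate_tokens(
#         io.StringIO("x = 1\n").readline
#     ):
#         print(tok_name[tok_type], repr(tok_str), start, end)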
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Text,
    Tuple,
    Union,
    cast,
)

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices):
    return "(" + "|".join(choices) + ")"


def any(*choices):
    return group(*choices) + "*"


def maybe(*choices):
    return group(*choices) + "?"


def _combinations(*l):
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
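# Editor's note: an illustrative sketch, not in the original source, of what
# the helpers above produce:
#
#     group("a", "b")                  # -> "(a|b)"
#     maybe("a", "b")                  # -> "(a|b)?"
#     any(r"\d")                       # -> "(\d)*"  (zero or more digits)
#     sorted(_combinations("r", "b"))  # -> ['b', 'br', 'r', 'rb']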
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
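# Editor's note: a quick illustration, not in the original source, of inputs
# accepted by the combined Number pattern (anchored with re.match, as the
# tokenizer effectively does via PseudoToken below):
#
#     re.match(Number, "0x_FF").group()       # "0x_FF"      (Hexnumber)
#     re.match(Number, "1_000.5e3j").group()  # "1_000.5e3j" (Imagnumber)
#     re.match(Number, "0o777").group()       # "0o777"      (Octnumber)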
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
    r"[+\-*/%&@|^=<>:]=?",
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
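# Editor's note: an illustrative sketch, not in the original source, of how
# generate_tokens() uses the compiled PseudoToken pattern; group 1 spans the
# token itself, with leading whitespace consumed by the Whitespace prefix:
#
#     m = pseudoprog.match("    spam = 'eggs'\n")
#     m.span(1)   # (4, 8) -> the name "spam"
#     m = pseudoprog.match("    spam = 'eggs'\n", 9)
#     m.group(1)  # "="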
tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


def printtoken(type, token, start, end, line):  # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


Coord = Tuple[int, int]
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
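# Editor's note: an illustrative callback-style sketch, not in the original
# source, showing the tokeneater protocol described above:
#
#     import io
#
#     def eater(type, token, start, end, line):
#         print(tok_name[type], repr(token), start, end)
#
#     tokenize_loop(io.StringIO("if x:\n    y()\n").readline, eater)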
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    def __init__(self) -> None:
        self.tokens: List[Text] = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will be raised. Note that if a UTF-8 BOM
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
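# Editor's note: an illustrative sketch, not in the original source, of
# detect_encoding() applied to a byte stream carrying a PEP 263 coding cookie:
#
#     import io
#
#     src = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     enc, lines = detect_encoding(io.BytesIO(src).readline)
#     # enc == "iso-8859-1"; lines holds the single cookie line already consumed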
def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
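# Editor's note: a concrete round-trip sketch, not in the original source,
# exercising the full-input invariant documented above:
#
#     import io
#
#     source = "a = b + 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source  # full 5-tuples round-trip exactly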
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "." and token != "..."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
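# Editor's note: an illustrative sketch, not in the original source, of the
# 5-tuples produced for an indented block, including the INDENT/DEDENT
# bookkeeping described above:
#
#     import io
#
#     for tok in generate_tokens(io.StringIO("if x:\n    y = 1\n").readline):
#         print(tok)
#     # yields NAME 'if' at (1, 0)-(1, 2), NAME 'x', OP ':', NEWLINE, then an
#     # INDENT for the second line, the tokens of "y = 1", NEWLINE, DEDENT,
#     # and a final ENDMARKER.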
if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)