# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except that instead of generating tokens, tokeneater is a
callback function to which the 5 fields described above are passed as 5
arguments each time a new token is found."""
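
# For example (an illustrative note added in this listing, not part of the
# original docstring), tokenizing the single line "x = 1\n" yields a stream
# roughly like:
#     (NAME, "x", (1, 0), (1, 1), "x = 1\n")
#     (OP, "=", (1, 2), (1, 3), "x = 1\n")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1\n")
#     (NEWLINE, "\n", (1, 5), (1, 6), "x = 1\n")
#     (ENDMARKER, "", (2, 0), (2, 0), "")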

from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Text,
    Tuple,
    Union,
    cast,
)

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token
63 return "(" + "|".join(choices) + ")"
67 return group(*choices) + "*"
71 return group(*choices) + "?"
74 def _combinations(*l):
75 return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
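

# Illustrative sketch (added for this listing, not part of the original
# module): what the regex helpers above evaluate to. Note that this module's
# `any` deliberately shadows the builtin.
def _example_regex_helpers() -> None:
    assert group("a", "b") == "(a|b)"
    assert any("x") == "(x)*"
    assert maybe("x") == "(x)?"
    # _combinations() builds every case-distinct pairing, plus the bare items.
    assert "rb" in _combinations("r", "R", "b", "B")
    assert "rR" not in _combinations("r", "R", "b", "B")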


Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = r"\w+"  # \w+ over-matches valid names, but that's fine because Name comes after Number in all groups

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
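

# Illustrative sketch (added for this listing, not part of the original
# module): the combined Number pattern accepts the usual literal spellings,
# including underscore digit grouping.
def _example_number_regex() -> None:
    compiled = re.compile(Number)
    for literal in ("0b1010", "0xFF", "0o755", "1_000", "3.14e-2", "1j"):
        assert compiled.fullmatch(literal), literal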


# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
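

# Illustrative sketch (added for this listing, not part of the original
# module): because the longer alternatives come first, compound operators are
# matched as a single token rather than being split up.
def _example_operator_regex() -> None:
    assert re.match(Operator, "//=").group() == "//="
    assert re.match(Operator, "**").group() == "**"
    assert re.match(Operator, "==").group() == "=="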


# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
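

# Illustrative sketch (added for this listing, not part of the original
# module): pseudoprog (compiled just below) is how generate_tokens() scans a
# physical line; group 1 spans the token itself, past any leading whitespace.
def _example_pseudoprog() -> None:
    m = pseudoprog.match("    spam = 1\n", 0)
    assert m is not None
    assert m.span(1) == (4, 8)  # the identifier "spam"
    assert m.group(1) == "spam"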


pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8
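

# Illustrative sketch (added for this listing, not part of the original
# module): the prefix tables above are what classify an opening quote in
# generate_tokens().
def _example_string_prefixes() -> None:
    assert "'''" in triple_quoted and 'rb"""' in triple_quoted
    assert "f'" in single_quoted and '"' in single_quoted
    assert endprogs["'''"] is single3prog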


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


Coord = Tuple[int, int]
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
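

# Illustrative sketch (added for this listing, not part of the original
# module): driving the callback-style interface with a readline built from a
# plain string; the helper name and the `source` default are hypothetical.
def _example_tokenize(source: Text = "x = 1\n") -> None:
    import io

    collected = []

    def eater(type: int, token: Text, start: Coord, end: Coord, line: Text) -> None:
        collected.append((type, token, start, end, line))

    tokenize(io.StringIO(source).readline, eater)
    assert collected[0][:2] == (NAME, "x")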


GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
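

# Illustrative sketch (added for this listing, not part of the original
# module): reading a PEP 263 coding cookie from a bytes stream.
def _example_detect_encoding() -> None:
    import io

    src = b"# -*- coding: latin-1 -*-\nspam = 1\n"
    encoding, consumed = detect_encoding(io.BytesIO(src).readline)
    assert encoding == "iso-8859-1"
    assert consumed == [b"# -*- coding: latin-1 -*-\n"]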


def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
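

# Illustrative sketch (added for this listing, not part of the original
# module): with full 5-tuples the round trip reproduces the source exactly
# for simple input.
def _example_untokenize(source: Text = "x = 1\n") -> None:
    import io

    tokens = list(generate_tokens(io.StringIO(source).readline))
    assert untokenize(tokens) == source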


def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "." and token != "..."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":

                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
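

# Illustrative sketch (added for this listing, not part of the original
# module): iterating the generator directly over a string-backed readline.
def _example_generate_tokens(source: Text = "if x:\n    pass\n") -> None:
    import io

    for type, token, start, end, line in generate_tokens(io.StringIO(source).readline):
        print(tok_name[type], repr(token), start, end)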


if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)