All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
4 # mypy: allow-untyped-defs, allow-untyped-calls
6 """Tokenization help for Python programs.
8 generate_tokens(readline) is a generator that breaks a stream of
9 text into Python tokens. It accepts a readline-like method which is called
10 repeatedly to get the next line of input (or "" for EOF). It generates
11 5-tuples with these members:
13 the token type (see token.py)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19 It is designed to match the working of the Python tokenizer exactly, except
20 that it produces COMMENT tokens for comments and gives type OP for all
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26 are the same, except instead of generating tokens, tokeneater is a callback
27 function to which the 5 fields described above are passed as 5 arguments,
28 each time a new token is found."""
42 from blib2to3.pgen2.token import *
43 from blib2to3.pgen2.grammar import Grammar
45 __author__ = "Ka-Ping Yee <ping@lfw.org>"
46 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
49 from codecs import BOM_UTF8, lookup
50 from blib2to3.pgen2.token import *
54 __all__ = [x for x in dir(token) if x[0] != "_"] + [
63 return "(" + "|".join(choices) + ")"
67 return group(*choices) + "*"
71 return group(*choices) + "?"
74 def _combinations(*l):
75 return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
78 Whitespace = r"[ \f\t]*"
79 Comment = r"#[^\r\n]*"
80 Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
81 Name = r"\w+" # this is invalid but it's fine because Name comes after Number in all groups
83 Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
84 Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
85 Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
86 Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
87 Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
88 Exponent = r"[eE][-+]?\d+(?:_\d+)*"
89 Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
92 Expfloat = r"\d+(?:_\d+)*" + Exponent
93 Floatnumber = group(Pointfloat, Expfloat)
94 Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
95 Number = group(Imagnumber, Floatnumber, Intnumber)
97 # Tail end of ' string.
98 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
99 # Tail end of " string.
100 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
101 # Tail end of ''' string.
102 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
103 # Tail end of """ string.
104 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
105 _litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
106 Triple = group(_litprefix + "'''", _litprefix + '"""')
107 # Single-line ' or " string.
109 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
110 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
113 # Because of leftmost-then-longest match semantics, be sure to put the
114 # longest operators first (e.g., if = came before ==, == would get
115 # recognized as two instances of =).
124 r"[+\-*/%&@|^=<>:]=?",
129 Special = group(r"\r?\n", r"[:;.,`@]")
130 Funny = group(Operator, Bracket, Special)
132 PlainToken = group(Number, Funny, String, Name)
133 Token = Ignore + PlainToken
135 # First (or only) line of ' or " string.
137 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
138 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
140 PseudoExtras = group(r"\\\r?\n", Comment, Triple)
141 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
143 tokenprog = re.compile(Token, re.UNICODE)
144 pseudoprog = re.compile(PseudoToken, re.UNICODE)
145 single3prog = re.compile(Single3)
146 double3prog = re.compile(Double3)
149 _combinations("r", "R", "f", "F")
150 | _combinations("r", "R", "b", "B")
151 | {"u", "U", "ur", "uR", "Ur", "UR"}
155 "'": re.compile(Single),
156 '"': re.compile(Double),
159 **{f"{prefix}'''": single3prog for prefix in _strprefixes},
160 **{f'{prefix}"""': double3prog for prefix in _strprefixes},
161 **{prefix: None for prefix in _strprefixes},
166 | {f"{prefix}'''" for prefix in _strprefixes}
167 | {f'{prefix}"""' for prefix in _strprefixes}
171 | {f"{prefix}'" for prefix in _strprefixes}
172 | {f'{prefix}"' for prefix in _strprefixes}
178 class TokenError(Exception):
182 class StopTokenizing(Exception):
186 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
187 (srow, scol) = xxx_todo_changeme
188 (erow, ecol) = xxx_todo_changeme1
190 "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
194 Coord = Tuple[int, int]
195 TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
198 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
200 The tokenize() function accepts two parameters: one representing the
201 input stream, and one providing an output mechanism for tokenize().
203 The first parameter, readline, must be a callable object which provides
204 the same interface as the readline() method of built-in file objects.
205 Each call to the function should return one line of input as a string.
207 The second parameter, tokeneater, must also be a callable object. It is
208 called once for each token, with five arguments, corresponding to the
209 tuples generated by generate_tokens().
212 tokenize_loop(readline, tokeneater)
213 except StopTokenizing:
217 # backwards compatible interface
218 def tokenize_loop(readline, tokeneater):
219 for token_info in generate_tokens(readline):
220 tokeneater(*token_info)
223 GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
224 TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
233 def __init__(self) -> None:
238 def add_whitespace(self, start: Coord) -> None:
240 assert row <= self.prev_row
241 col_offset = col - self.prev_col
243 self.tokens.append(" " * col_offset)
245 def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
248 self.compat(cast(Tuple[int, str], t), iterable)
250 tok_type, token, start, end, line = cast(
251 Tuple[int, Text, Coord, Coord, Text], t
253 self.add_whitespace(start)
254 self.tokens.append(token)
255 self.prev_row, self.prev_col = end
256 if tok_type in (NEWLINE, NL):
259 return "".join(self.tokens)
261 def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
264 toks_append = self.tokens.append
265 toknum, tokval = token
266 if toknum in (NAME, NUMBER):
268 if toknum in (NEWLINE, NL):
271 toknum, tokval = tok[:2]
273 if toknum in (NAME, NUMBER, ASYNC, AWAIT):
277 indents.append(tokval)
279 elif toknum == DEDENT:
282 elif toknum in (NEWLINE, NL):
284 elif startline and indents:
285 toks_append(indents[-1])
290 cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
291 blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
294 def _get_normal_name(orig_enc: str) -> str:
295 """Imitates get_normal_name in tokenizer.c."""
296 # Only care about the first 12 characters.
297 enc = orig_enc[:12].lower().replace("_", "-")
298 if enc == "utf-8" or enc.startswith("utf-8-"):
300 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
301 ("latin-1-", "iso-8859-1-", "iso-latin-1-")
307 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
309 The detect_encoding() function is used to detect the encoding that should
310 be used to decode a Python source file. It requires one argument, readline,
311 in the same way as the tokenize() generator.
313 It will call readline a maximum of twice, and return the encoding used
314 (as a string) and a list of any lines (left as bytes) it has read
317 It detects the encoding from the presence of a utf-8 bom or an encoding
318 cookie as specified in pep-0263. If both a bom and a cookie are present, but
319 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
320 charset, raise a SyntaxError. Note that if a utf-8 bom is found,
321 'utf-8-sig' is returned.
323 If no encoding is specified, then the default of 'utf-8' will be returned.
329 def read_or_stop() -> bytes:
332 except StopIteration:
335 def find_cookie(line: bytes) -> Optional[str]:
337 line_string = line.decode("ascii")
338 except UnicodeDecodeError:
340 match = cookie_re.match(line_string)
343 encoding = _get_normal_name(match.group(1))
345 codec = lookup(encoding)
347 # This behaviour mimics the Python interpreter
348 raise SyntaxError("unknown encoding: " + encoding)
351 if codec.name != "utf-8":
352 # This behaviour mimics the Python interpreter
353 raise SyntaxError("encoding problem: utf-8")
357 first = read_or_stop()
358 if first.startswith(BOM_UTF8):
361 default = "utf-8-sig"
365 encoding = find_cookie(first)
367 return encoding, [first]
368 if not blank_re.match(first):
369 return default, [first]
371 second = read_or_stop()
373 return default, [first]
375 encoding = find_cookie(second)
377 return encoding, [first, second]
379 return default, [first, second]
382 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
383 """Transform tokens back into Python source code.
385 Each element returned by the iterable must be a token sequence
386 with at least two elements, a token number and token value. If
387 only two tokens are passed, the resulting output is poor.
389 Round-trip invariant for full input:
390 Untokenized source will match input source exactly
392 Round-trip invariant for limited intput:
393 # Output text will tokenize the back to the input
394 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
395 newcode = untokenize(t1)
396 readline = iter(newcode.splitlines(1)).next
397 t2 = [tok[:2] for tokin generate_tokens(readline)]
401 return ut.untokenize(iterable)
405 readline: Callable[[], Text], grammar: Optional[Grammar] = None
406 ) -> Iterator[GoodTokenInfo]:
408 The generate_tokens() generator requires one argument, readline, which
409 must be a callable object which provides the same interface as the
410 readline() method of built-in file objects. Each call to the function
411 should return one line of input as a string. Alternately, readline
412 can be a callable function terminating with StopIteration:
413 readline = open(myfile).next # Example of alternate readline
415 The generator produces 5-tuples with these members: the token type; the
416 token string; a 2-tuple (srow, scol) of ints specifying the row and
417 column where the token begins in the source; a 2-tuple (erow, ecol) of
418 ints specifying the row and column where the token ends in the source;
419 and the line on which the token was found. The line passed is the
420 logical line; continuation lines are included.
422 lnum = parenlev = continued = 0
423 numchars = "0123456789"
424 contstr, needcont = "", 0
425 contline: Optional[str] = None
428 # If we know we're parsing 3.7+, we can unconditionally parse `async` and
429 # `await` as keywords.
430 async_keywords = False if grammar is None else grammar.async_keywords
431 # 'stashed' and 'async_*' are used for async/await parsing
437 strstart: Tuple[int, int]
438 endprog: Pattern[str]
440 while 1: # loop over lines in stream
443 except StopIteration:
446 pos, max = 0, len(line)
448 if contstr: # continued string
449 assert contline is not None
451 raise TokenError("EOF in multi-line string", strstart)
452 endmatch = endprog.match(line)
454 pos = end = endmatch.end(0)
457 contstr + line[:end],
462 contstr, needcont = "", 0
464 elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
476 contstr = contstr + line
477 contline = contline + line
480 elif parenlev == 0 and not continued: # new statement
484 while pos < max: # measure leading whitespace
487 elif line[pos] == "\t":
488 column = (column // tabsize + 1) * tabsize
489 elif line[pos] == "\f":
501 if line[pos] in "\r\n": # skip blank lines
502 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
505 if line[pos] == "#": # skip comments
506 comment_token = line[pos:].rstrip("\r\n")
507 nl_pos = pos + len(comment_token)
512 (lnum, pos + len(comment_token)),
515 yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
518 if column > indents[-1]: # count indents
519 indents.append(column)
520 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
522 while column < indents[-1]: # count dedents
523 if column not in indents:
524 raise IndentationError(
525 "unindent does not match any outer indentation level",
526 ("<tokenize>", lnum, pos, line),
528 indents = indents[:-1]
530 if async_def and async_def_indent >= indents[-1]:
535 yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
537 if async_def and async_def_nl and async_def_indent >= indents[-1]:
542 else: # continued statement
544 raise TokenError("EOF in multi-line statement", (lnum, 0))
548 pseudomatch = pseudoprog.match(line, pos)
549 if pseudomatch: # scan for tokens
550 start, end = pseudomatch.span(1)
551 spos, epos, pos = (lnum, start), (lnum, end), end
552 token, initial = line[start:end], line[start]
554 if initial in numchars or (
555 initial == "." and token != "."
557 yield (NUMBER, token, spos, epos, line)
558 elif initial in "\r\n":
567 yield (newline, token, spos, epos, line)
570 assert not token.endswith("\n")
574 yield (COMMENT, token, spos, epos, line)
575 elif token in triple_quoted:
576 endprog = endprogs[token]
577 endmatch = endprog.match(line, pos)
578 if endmatch: # all on one line
579 pos = endmatch.end(0)
580 token = line[start:pos]
584 yield (STRING, token, spos, (lnum, pos), line)
586 strstart = (lnum, start) # multiple lines
587 contstr = line[start:]
591 initial in single_quoted
592 or token[:2] in single_quoted
593 or token[:3] in single_quoted
595 if token[-1] == "\n": # continued string
596 strstart = (lnum, start)
599 or endprogs[token[1]]
600 or endprogs[token[2]]
602 contstr, needcont = line[start:], 1
605 else: # ordinary string
609 yield (STRING, token, spos, epos, line)
610 elif initial.isidentifier(): # ordinary name
611 if token in ("async", "await"):
612 if async_keywords or async_def:
614 ASYNC if token == "async" else AWAIT,
622 tok = (NAME, token, spos, epos, line)
623 if token == "async" and not stashed:
627 if token in ("def", "for"):
628 if stashed and stashed[0] == NAME and stashed[1] == "async":
632 async_def_indent = indents[-1]
648 elif initial == "\\": # continued stmt
649 # This yield is new; needed for better idempotency:
653 yield (NL, token, spos, (lnum, pos), line)
657 parenlev = parenlev + 1
658 elif initial in ")]}":
659 parenlev = parenlev - 1
663 yield (OP, token, spos, epos, line)
665 yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
672 for indent in indents[1:]: # pop remaining indent levels
673 yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
674 yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
677 if __name__ == "__main__": # testing
680 if len(sys.argv) > 1:
681 tokenize(open(sys.argv[1]).readline)
683 tokenize(sys.stdin.readline)