All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
4 # mypy: allow-untyped-defs, allow-untyped-calls
6 """Tokenization help for Python programs.
8 generate_tokens(readline) is a generator that breaks a stream of
9 text into Python tokens. It accepts a readline-like method which is called
10 repeatedly to get the next line of input (or "" for EOF). It generates
11 5-tuples with these members:
13 the token type (see token.py)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19 It is designed to match the working of the Python tokenizer exactly, except
20 that it produces COMMENT tokens for comments and gives type OP for all
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26 are the same, except instead of generating tokens, tokeneater is a callback
27 function to which the 5 fields described above are passed as 5 arguments,
28 each time a new token is found."""
44 if sys.version_info >= (3, 8):
45 from typing import Final
47 from typing_extensions import Final
49 from blib2to3.pgen2.token import *
50 from blib2to3.pgen2.grammar import Grammar
52 __author__ = "Ka-Ping Yee <ping@lfw.org>"
53 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
56 from codecs import BOM_UTF8, lookup
57 from blib2to3.pgen2.token import *
61 __all__ = [x for x in dir(token) if x[0] != "_"] + [
70 return "(" + "|".join(choices) + ")"
74 return group(*choices) + "*"
78 return group(*choices) + "?"
81 def _combinations(*l):
82 return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
# Horizontal whitespace only: spaces, form feeds and tabs (never newlines).
Whitespace = r"[ \f\t]*"
# A comment runs from "#" up to, but not including, the end-of-line characters.
Comment = r"#[^\r\n]*"
# Leading whitespace, then backslash-newline continuations (each followed by
# more whitespace), then a comment.
# NOTE(review): `any` and `maybe` here are this module's regex helpers, not
# the builtins; they appear to wrap a group in `*` and `?` - confirm.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
88 Name = ( # this is invalid but it's fine because Name comes after Number in all groups
# Integer literals. Every pattern allows "_" between digit runs; the hex,
# octal and decimal patterns also accept an optional trailing "l"/"L".
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o"/"O" marker is optional, so a bare leading zero ("0777") also matches.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Exponent part of a float: "e"/"E", optional sign, digits with "_" separators.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
98 Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
# A float written without a decimal point: digits followed by an exponent.
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literal: plain digits or any float form, followed by "j"/"J".
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are listed imaginary, float, integer.
# NOTE(review): presumably ordered so the longer forms are tried before their
# integer prefix - confirm against group()'s alternation semantics.
Number = group(Imagnumber, Floatnumber, Intnumber)
# Each pattern below matches the REST of a string literal, up to and including
# its closing quote(s); the opening quote is not part of the pattern.
# A backslash always consumes the character after it (`\\.`), so an escaped
# quote never terminates the string.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
# A lone quote is allowed inside the body unless it starts the closing triple.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string-literal prefix: a single u/r/b/f letter (either case), or
# r followed by f/b, or f/b/u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string: optional prefix plus ''' or """.
Triple = group(_litprefix + "'''", _litprefix + '"""')
116 # Single-line ' or " string.
118 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
119 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
122 # Because of leftmost-then-longest match semantics, be sure to put the
123 # longest operators first (e.g., if = came before ==, == would get
124 # recognized as two instances of =).
133 r"[+\-*/%&@|^=<>:]=?",
# A line ending (with optional carriage return) or one punctuation character.
Special = group(r"\r?\n", r"[:;.,`@]")
# Every non-name, non-number, non-string token: operators, brackets, specials.
Funny = group(Operator, Bracket, Special)
141 # First (or only) line of ' or " string.
143 _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
144 _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
# Backslash-newline continuation, a comment, or the opening of a
# triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# One token preceded by optional whitespace. The token itself is the outer
# group, so it is capture group 1 of the compiled pattern.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
# Matchers for the remainder of a ''' / """ string once its opening is consumed.
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
154 _combinations("r", "R", "f", "F")
155 | _combinations("r", "R", "b", "B")
156 | {"u", "U", "ur", "uR", "Ur", "UR"}
160 "'": re.compile(Single),
161 '"': re.compile(Double),
164 **{f"{prefix}'''": single3prog for prefix in _strprefixes},
165 **{f'{prefix}"""': double3prog for prefix in _strprefixes},
166 **{prefix: None for prefix in _strprefixes},
169 triple_quoted: Final = (
171 | {f"{prefix}'''" for prefix in _strprefixes}
172 | {f'{prefix}"""' for prefix in _strprefixes}
174 single_quoted: Final = (
176 | {f"{prefix}'" for prefix in _strprefixes}
177 | {f'{prefix}"' for prefix in _strprefixes}
183 class TokenError(Exception):
187 class StopTokenizing(Exception):
191 def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
192 (srow, scol) = xxx_todo_changeme
193 (erow, ecol) = xxx_todo_changeme1
195 "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
# A (row, column) position in the source.
Coord = Tuple[int, int]
# Callback taking the five token fields as separate arguments:
# (type, token string, start coord, end coord, source line).
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
203 def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
205 The tokenize() function accepts two parameters: one representing the
206 input stream, and one providing an output mechanism for tokenize().
208 The first parameter, readline, must be a callable object which provides
209 the same interface as the readline() method of built-in file objects.
210 Each call to the function should return one line of input as a string.
212 The second parameter, tokeneater, must also be a callable object. It is
213 called once for each token, with five arguments, corresponding to the
214 tuples generated by generate_tokens().
217 tokenize_loop(readline, tokeneater)
218 except StopTokenizing:
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call *tokeneater* once for every token generated from *readline*.

    Each tuple yielded by generate_tokens(readline) is unpacked into
    positional arguments for the tokeneater callback.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
# Full five-field token: (type, string, start coord, end coord, source line).
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
# Either a bare (type, string) pair or a full five-field token.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
238 def __init__(self) -> None:
243 def add_whitespace(self, start: Coord) -> None:
245 assert row <= self.prev_row
246 col_offset = col - self.prev_col
248 self.tokens.append(" " * col_offset)
250 def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
253 self.compat(cast(Tuple[int, str], t), iterable)
255 tok_type, token, start, end, line = cast(
256 Tuple[int, Text, Coord, Coord, Text], t
258 self.add_whitespace(start)
259 self.tokens.append(token)
260 self.prev_row, self.prev_col = end
261 if tok_type in (NEWLINE, NL):
264 return "".join(self.tokens)
266 def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
269 toks_append = self.tokens.append
270 toknum, tokval = token
271 if toknum in (NAME, NUMBER):
273 if toknum in (NEWLINE, NL):
276 toknum, tokval = tok[:2]
278 if toknum in (NAME, NUMBER, ASYNC, AWAIT):
282 indents.append(tokval)
284 elif toknum == DEDENT:
287 elif toknum in (NEWLINE, NL):
289 elif startline and indents:
290 toks_append(indents[-1])
# Matches a comment-only line carrying an encoding declaration
# ("coding:" or "coding="); group 1 is the declared encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a bytes line that is empty, whitespace-only, or a comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
299 def _get_normal_name(orig_enc: str) -> str:
300 """Imitates get_normal_name in tokenizer.c."""
301 # Only care about the first 12 characters.
302 enc = orig_enc[:12].lower().replace("_", "-")
303 if enc == "utf-8" or enc.startswith("utf-8-"):
305 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
306 ("latin-1-", "iso-8859-1-", "iso-latin-1-")
312 def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
314 The detect_encoding() function is used to detect the encoding that should
315 be used to decode a Python source file. It requires one argument, readline,
316 in the same way as the tokenize() generator.
318 It will call readline a maximum of twice, and return the encoding used
319 (as a string) and a list of any lines (left as bytes) it has read
322 It detects the encoding from the presence of a utf-8 bom or an encoding
323 cookie as specified in pep-0263. If both a bom and a cookie are present, but
324 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
325 charset, raise a SyntaxError. Note that if a utf-8 bom is found,
326 'utf-8-sig' is returned.
328 If no encoding is specified, then the default of 'utf-8' will be returned.
334 def read_or_stop() -> bytes:
337 except StopIteration:
340 def find_cookie(line: bytes) -> Optional[str]:
342 line_string = line.decode("ascii")
343 except UnicodeDecodeError:
345 match = cookie_re.match(line_string)
348 encoding = _get_normal_name(match.group(1))
350 codec = lookup(encoding)
352 # This behaviour mimics the Python interpreter
353 raise SyntaxError("unknown encoding: " + encoding)
356 if codec.name != "utf-8":
357 # This behaviour mimics the Python interpreter
358 raise SyntaxError("encoding problem: utf-8")
362 first = read_or_stop()
363 if first.startswith(BOM_UTF8):
366 default = "utf-8-sig"
370 encoding = find_cookie(first)
372 return encoding, [first]
373 if not blank_re.match(first):
374 return default, [first]
376 second = read_or_stop()
378 return default, [first]
380 encoding = find_cookie(second)
382 return encoding, [first, second]
384 return default, [first, second]
387 def untokenize(iterable: Iterable[TokenInfo]) -> Text:
388 """Transform tokens back into Python source code.
390 Each element returned by the iterable must be a token sequence
391 with at least two elements, a token number and token value. If
392 only two tokens are passed, the resulting output is poor.
394 Round-trip invariant for full input:
395 Untokenized source will match input source exactly
397 Round-trip invariant for limited input:
398 # Output text will tokenize back to the input
399 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
400 newcode = untokenize(t1)
401 readline = iter(newcode.splitlines(1)).next
402 t2 = [tok[:2] for tok in generate_tokens(readline)]
406 return ut.untokenize(iterable)
410 readline: Callable[[], Text], grammar: Optional[Grammar] = None
411 ) -> Iterator[GoodTokenInfo]:
413 The generate_tokens() generator requires one argument, readline, which
414 must be a callable object which provides the same interface as the
415 readline() method of built-in file objects. Each call to the function
416 should return one line of input as a string. Alternately, readline
417 can be a callable function terminating with StopIteration:
418 readline = open(myfile).next # Example of alternate readline
420 The generator produces 5-tuples with these members: the token type; the
421 token string; a 2-tuple (srow, scol) of ints specifying the row and
422 column where the token begins in the source; a 2-tuple (erow, ecol) of
423 ints specifying the row and column where the token ends in the source;
424 and the line on which the token was found. The line passed is the
425 logical line; continuation lines are included.
427 lnum = parenlev = continued = 0
428 numchars: Final = "0123456789"
429 contstr, needcont = "", 0
430 contline: Optional[str] = None
433 # If we know we're parsing 3.7+, we can unconditionally parse `async` and
434 # `await` as keywords.
435 async_keywords = False if grammar is None else grammar.async_keywords
436 # 'stashed' and 'async_*' are used for async/await parsing
437 stashed: Optional[GoodTokenInfo] = None
442 strstart: Tuple[int, int]
443 endprog: Pattern[str]
445 while 1: # loop over lines in stream
448 except StopIteration:
451 pos, max = 0, len(line)
453 if contstr: # continued string
454 assert contline is not None
456 raise TokenError("EOF in multi-line string", strstart)
457 endmatch = endprog.match(line)
459 pos = end = endmatch.end(0)
462 contstr + line[:end],
467 contstr, needcont = "", 0
469 elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
481 contstr = contstr + line
482 contline = contline + line
485 elif parenlev == 0 and not continued: # new statement
489 while pos < max: # measure leading whitespace
492 elif line[pos] == "\t":
493 column = (column // tabsize + 1) * tabsize
494 elif line[pos] == "\f":
506 if line[pos] in "\r\n": # skip blank lines
507 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
510 if line[pos] == "#": # skip comments
511 comment_token = line[pos:].rstrip("\r\n")
512 nl_pos = pos + len(comment_token)
520 yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
523 if column > indents[-1]: # count indents
524 indents.append(column)
525 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
527 while column < indents[-1]: # count dedents
528 if column not in indents:
529 raise IndentationError(
530 "unindent does not match any outer indentation level",
531 ("<tokenize>", lnum, pos, line),
533 indents = indents[:-1]
535 if async_def and async_def_indent >= indents[-1]:
540 yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
542 if async_def and async_def_nl and async_def_indent >= indents[-1]:
547 else: # continued statement
549 raise TokenError("EOF in multi-line statement", (lnum, 0))
553 pseudomatch = pseudoprog.match(line, pos)
554 if pseudomatch: # scan for tokens
555 start, end = pseudomatch.span(1)
556 spos, epos, pos = (lnum, start), (lnum, end), end
557 token, initial = line[start:end], line[start]
559 if initial in numchars or (
560 initial == "." and token != "."
562 yield (NUMBER, token, spos, epos, line)
563 elif initial in "\r\n":
572 yield (newline, token, spos, epos, line)
575 assert not token.endswith("\n")
579 yield (COMMENT, token, spos, epos, line)
580 elif token in triple_quoted:
581 endprog = endprogs[token]
582 endmatch = endprog.match(line, pos)
583 if endmatch: # all on one line
584 pos = endmatch.end(0)
585 token = line[start:pos]
589 yield (STRING, token, spos, (lnum, pos), line)
591 strstart = (lnum, start) # multiple lines
592 contstr = line[start:]
596 initial in single_quoted
597 or token[:2] in single_quoted
598 or token[:3] in single_quoted
600 if token[-1] == "\n": # continued string
601 strstart = (lnum, start)
604 or endprogs[token[1]]
605 or endprogs[token[2]]
607 contstr, needcont = line[start:], 1
610 else: # ordinary string
614 yield (STRING, token, spos, epos, line)
615 elif initial.isidentifier(): # ordinary name
616 if token in ("async", "await"):
617 if async_keywords or async_def:
619 ASYNC if token == "async" else AWAIT,
627 tok = (NAME, token, spos, epos, line)
628 if token == "async" and not stashed:
632 if token in ("def", "for"):
633 if stashed and stashed[0] == NAME and stashed[1] == "async":
637 async_def_indent = indents[-1]
653 elif initial == "\\": # continued stmt
654 # This yield is new; needed for better idempotency:
658 yield (NL, token, spos, (lnum, pos), line)
663 elif initial in ")]}":
668 yield (OP, token, spos, epos, line)
670 yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
677 for indent in indents[1:]: # pop remaining indent levels
678 yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
679 yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
682 if __name__ == "__main__": # testing
685 if len(sys.argv) > 1:
686 tokenize(open(sys.argv[1]).readline)
688 tokenize(sys.stdin.readline)