# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]

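# Illustrative usage (editor's note, not part of the upstream lib2to3-derived
# source): generate_tokens() only needs a readline-style callable, so an
# in-memory string can be tokenized with io.StringIO:
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok_name[tok[0]], repr(tok[1]), tok[2], tok[3])
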
# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.


def group(*choices):
    return "(" + "|".join(choices) + ")"


def any(*choices):
    return group(*choices) + "*"


def maybe(*choices):
    return group(*choices) + "?"

def _combinations(*l):
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())

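# Editor's note (illustrative, not in the original source): these helpers only
# build regular-expression source strings, e.g.
#
#     group("a", "b")  -> "(a|b)"
#     any("a", "b")    -> "(a|b)*"
#     maybe("a", "b")  -> "(a|b)?"
#
# and _combinations("r", "R", "f", "F") is the set of one- and two-letter
# prefixes whose parts differ under casefolding:
#     {"r", "R", "f", "F", "rf", "rF", "Rf", "RF", "fr", "fR", "Fr", "FR"}
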
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = r"\w+"  # this is invalid but it's fine because Name comes after Number in all groups

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
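# Editor's note (illustrative, not in the original source): with the patterns
# above, Number accepts literals such as
#     "0b1010", "0o17", "0xDEADBEEF", "1_000_000"   (integers, incl. PEP 515 underscores)
#     "3.14", ".5", "1e10", "6.02e+23"              (floats)
#     "1j", "3.14j"                                 (imaginary)
# The trailing [lL]? in the integer patterns is presumably kept so that 2to3
# can still read Python 2 long literals such as "42L".
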
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

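# Illustrative example (editor's note, not in the original source): with the
# default tokeneater, tokenize() simply prints one line per token; a custom
# tokeneater(type, token, start, end, line) can collect tokens instead, and
# may raise StopTokenizing to stop early.
#
#     import io
#     tokenize(io.StringIO("x = 1\n").readline)
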
class Untokenizer:
    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)

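# Editor's note (illustrative, not in the original source): cookie_re matches
# PEP 263 encoding declarations, e.g.
#     "# -*- coding: latin-1 -*-"        -> group(1) == "latin-1"
#     "# vim: set fileencoding=utf-8 :"  -> group(1) == "utf-8"
# blank_re matches blank or comment-only lines; detect_encoding() below only
# consults the second line for a cookie when the first line matches blank_re.
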
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

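# Illustrative example (editor's note, not in the original source):
# detect_encoding() expects a readline that yields bytes, e.g. from io.BytesIO:
#
#     import io
#     buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == "iso-8859-1" (the normalised name for latin-1), and
#     # lines holds the raw line(s) already consumed.
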
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

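# Illustrative example (editor's note, not in the original source): a round
# trip through full 5-tuples reproduces the source text exactly, as the
# docstring above promises; with (type, string) pairs only, compat() is used
# and spacing is merely approximated.
#
#     import io
#     src = "if x:\n    y = 1\n"
#     toks = list(generate_tokens(io.StringIO(src).readline))
#     assert untokenize(toks) == src
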
def generate_tokens(readline, grammar=None):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    contstr, needcont = "", 0
    contline = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end], strstart, (lnum, end), contline + line)
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline)
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column = column + 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, pos + len(comment_token)),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)
                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        endprog = (
                            endprogs[initial]
                            or endprogs[token[1]]
                            or endprogs[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (ASYNC if token == "async" else AWAIT, token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]
                            yield (ASYNC, stashed[1], stashed[2], stashed[3], stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev = parenlev + 1
                    elif initial in ")]}":
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")

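# Editor's note (illustrative, not in the original source): for
# "def f():\n\n    return 1\n" the generator yields roughly
#     NAME 'def', NAME 'f', OP '(', OP ')', OP ':', NEWLINE '\n',
#     NL '\n', INDENT '    ', NAME 'return', NUMBER '1', NEWLINE '\n',
#     DEDENT '', ENDMARKER ''
# i.e. blank lines come back as NL, logical line ends as NEWLINE, and
# indentation changes as INDENT/DEDENT, with remaining dedents flushed at EOF.
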
if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)