# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
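# Example (illustrative sketch, not from the original module): one way to
# supply the readline callable described above is io.StringIO(...).readline.
#
#     import io
#     source = "x = 1 + 2\n"
#     for tok_type, tok_str, start, end, line in generate_tokens(io.StringIO(source).readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)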
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]

# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",

Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8
class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
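# Example (illustrative sketch, not from the original module): driving the
# callback interface with the default printtoken tokeneater; "example.py" is
# a placeholder file name.
#
#     with open("example.py") as f:
#         tokenize(f.readline)   # prints one "srow,scol-erow,ecol:  TYPE  value" line per token
#
# A custom tokeneater only needs to accept the same five arguments
# (type, string, start, end, line) that generate_tokens() yields.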
class Untokenizer:
    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
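# cookie_re matches PEP 263 encoding declarations such as
#     # -*- coding: latin-1 -*-
#     # vim: set fileencoding=utf-8 :
# blank_re matches blank or comment-only lines; detect_encoding() below only
# consults the second line for a cookie when the first line matches blank_re.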
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
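# Example (illustrative sketch, not from the original module): detect_encoding()
# expects a readline that yields bytes; "example.py" is a placeholder name.
#
#     with open("example.py", "rb") as f:
#         encoding, first_lines = detect_encoding(f.readline)
#     # encoding is e.g. 'utf-8', 'utf-8-sig', or whatever the coding cookie names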
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
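# Example (illustrative sketch, not from the original module): the full-input
# round trip stated in the docstring, using complete 5-tuples.
#
#     import io
#     source = "def f(x):\n    return x + 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source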
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0

    # 'stashed' and 'async_*' are used for async/await parsing

    while 1:                                   # loop over lines in stream
        except StopIteration:
        pos, max = 0, len(line)

        if contstr:                            # continued string
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = contstr + line
                contline = contline + line
        elif parenlev == 0 and not continued:  # new statement
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0

            if line[pos] in '#\r\n':           # skip comments or blank lines
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:

        else:                                  # continued statement
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (newline, token, spos, epos, line)

                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                        contstr, needcont = line[start:], 1
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:

                            and stashed[0] == NAME
                            and stashed[1] == 'async'):

                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],

                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)

                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
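# Example (illustrative sketch): assuming this file is importable as
# blib2to3.pgen2.tokenize, the self-test block above can be run with
#
#     python -m blib2to3.pgen2.tokenize example.py
#
# which prints one printtoken() line per token of example.py (a placeholder name).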