# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
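# Illustrative usage (not part of the original module): a minimal sketch of
# driving the generator interface described above, assuming
# io.StringIO(...).readline as a stand-in for a file object's readline method:
#
#     import io
#     for tok_type, tok_str, start, end, logical_line in generate_tokens(
#             io.StringIO("x = 1\n").readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)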
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re, unicodedata
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from blib2to3.pgen2 import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
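# Added note (not in the original source): these helpers simply build regex
# alternations, e.g. group('a', 'b') == '(a|b)', any('x') == '(x)*',
# maybe('y') == '(y)?'.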
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
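# Added note (not in the original source): Number accepts PEP 515 underscore
# grouping, so strings such as "0b1010", "3.14_15e+2", and "1_000j" all
# satisfy re.match(Number, ...).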
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t
class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
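# Illustrative sketch (not part of the original module): driving the callback
# interface above with the default printtoken tokeneater; io.StringIO is
# assumed as a stand-in for an open file's readline.
def _example_tokenize_callback(source="x = 1\n"):
    import io
    # Any callable accepting (type, token, start, end, line) works here.
    tokenize(io.StringIO(source).readline, printtoken)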
    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
    def compat(self, token, iterable):
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
        if toknum in (NEWLINE, NL):
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                indents.append(tokval)
            elif toknum == DEDENT:
            elif toknum in (NEWLINE, NL):
            elif startline and indents:
                toks_append(indents[-1])
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
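# Illustrative sketch (not part of the original module): detect_encoding()
# expects byte strings from readline, so io.BytesIO is assumed here.
def _example_detect_encoding():
    import io
    data = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines_read = detect_encoding(io.BytesIO(data).readline)
    # encoding is 'iso-8859-1' (the normalized name for latin-1); lines_read
    # holds the raw byte lines consumed while sniffing the cookie.
    return encoding, lines_read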
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0

    # 'stashed' and 'async_*' are used for async/await parsing
    while 1:                                   # loop over lines in stream
        except StopIteration:
        pos, max = 0, len(line)

        if contstr:                            # continued string
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = contstr + line
                contline = contline + line

        elif parenlev == 0 and not continued:  # new statement
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0

            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)

            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            if line[pos] == '#':               # skip comments
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                       (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                       (lnum, nl_pos), (lnum, len(line)), line)

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:

        else:                                  # continued statement
                raise TokenError("EOF in multi-line statement", (lnum, 0))

            pseudomatch = pseudoprog.match(line, pos)
                print('no pseudomatch')
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (newline, token, spos, epos, line)
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                        contstr, needcont = line[start:], 1
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif (initial in namechars or              # ordinary name
                      unicodedata.category(initial) in InitialCategories):
                    if token in ('async', 'await'):
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):
                            async_def_indent = indents[-1]
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)