All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
4 """Tokenization help for Python programs.
6 generate_tokens(readline) is a generator that breaks a stream of
7 text into Python tokens. It accepts a readline-like method which is called
8 repeatedly to get the next line of input (or "" for EOF). It generates
9 5-tuples with these members:
11 the token type (see token.py)
13 the starting (row, column) indices of the token (a 2-tuple of ints)
14 the ending (row, column) indices of the token (a 2-tuple of ints)
15 the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
22 tokenize_loop(readline, tokeneater)
23 tokenize(readline, tokeneater=printtoken)
24 are the same, except instead of generating tokens, tokeneater is a callback
25 function to which the 5 fields described above are passed as 5 arguments,
26 each time a new token is found."""
28 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
30 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
33 from codecs import BOM_UTF8, lookup
34 from blib2to3.pgen2.token import *
37 __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
38 "generate_tokens", "untokenize"]
44 # Support bytes type in Python <= 2.5, so 2to3 turns itself into
45 # valid Python 3 code.
def group(*choices):
    """Join the alternatives into a single regex group: '(a|b|...)'."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """A group of alternatives repeated zero or more times: '(a|b|...)*'.

    NOTE: intentionally shadows the builtin `any` within this module.
    """
    return group(*choices) + '*'

def maybe(*choices):
    """An optional group of alternatives: '(a|b|...)?'."""
    return group(*choices) + '?'
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, optionally continued over backslash-newlines, then an
# optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals, with optional underscore separators (PEP 515).
# The [lL] suffixes presumably allow tokenizing legacy Python 2 longs --
# TODO confirm against callers.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Floating-point and imaginary literals.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string-literal prefix: one of r/R/b/B/u/U/f/F, or a two-letter
# combination of r+b or b/u+r in either case.
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# NOTE(review): the Operator continuation lines and Bracket were dropped
# from this excerpt; restored from the upstream lib2to3 source.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string: either the whole single-line
# string, or everything up to a backslash-continued line break.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern actually used to scan each position in a line.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
# Map an opening quote (with optional string prefix) to the compiled regex
# that matches the remainder of that string; bare prefix letters map to
# None so the tokenizer can fall through to the quote character itself.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}
# Set (as a self-mapping dict) of every prefixed triple-quote opener.
# NOTE(review): the `triple_quoted = {}` initialization and the loop body
# were dropped from this excerpt; restored from the upstream lib2to3 source.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
# Set (as a self-mapping dict) of every prefixed single-quote opener.
# NOTE(review): the `single_quoted = {}` initialization, the `for` header,
# the loop body, and the `tabsize` constant were dropped from this excerpt;
# restored from the upstream lib2to3 source.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

# Tab stops every 8 columns (used when measuring indentation).
tabsize = 8
164 class TokenError(Exception): pass
166 class StopTokenizing(Exception): pass
def printtoken(type, token, start, end, line): # for testing
    """Default tokeneater for tokenize(): print one token per line.

    `start` and `end` are (row, col) 2-tuples; `type` indexes tok_name.
    The parameter name `type` shadows the builtin but is kept for
    backward compatibility.  (The original `xxx_todo_changeme` names were
    placeholder artifacts of an automated 2to3 conversion.)
    """
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # StopTokenizing is the callback's way of aborting early.
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call `tokeneater` once with each 5-tuple from generate_tokens()."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
204 def add_whitespace(self, start):
206 assert row <= self.prev_row
207 col_offset = col - self.prev_col
209 self.tokens.append(" " * col_offset)
211 def untokenize(self, iterable):
214 self.compat(t, iterable)
216 tok_type, token, start, end, line = t
217 self.add_whitespace(start)
218 self.tokens.append(token)
219 self.prev_row, self.prev_col = end
220 if tok_type in (NEWLINE, NL):
223 return "".join(self.tokens)
225 def compat(self, token, iterable):
228 toks_append = self.tokens.append
229 toknum, tokval = token
230 if toknum in (NAME, NUMBER):
232 if toknum in (NEWLINE, NL):
235 toknum, tokval = tok[:2]
237 if toknum in (NAME, NUMBER, ASYNC, AWAIT):
241 indents.append(tokval)
243 elif toknum == DEDENT:
246 elif toknum in (NEWLINE, NL):
248 elif startline and indents:
249 toks_append(indents[-1])
# Recognize a PEP 263 coding cookie on a (decoded) source line.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Match a bytes line that is blank apart from whitespace or a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
256 def _get_normal_name(orig_enc):
257 """Imitates get_normal_name in tokenizer.c."""
258 # Only care about the first 12 characters.
259 enc = orig_enc[:12].lower().replace("_", "-")
260 if enc == "utf-8" or enc.startswith("utf-8-"):
262 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
263 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.

    NOTE(review): the bom/default bookkeeping and parts of the nested helper
    structure were dropped from this excerpt; restored from the upstream
    lib2to3 source.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        # Treat StopIteration from the readline callable as end of input.
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        # Return the normalized cookie encoding on this line, or None.
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        # A non-blank, non-cookie first line ends the cookie search.
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    NOTE(review): many control-flow lines of this generator were dropped
    from this excerpt; the body below is restored from the upstream
    lib2to3 source, anchored on every line that is visible here.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            if line[pos] == '#':               # skip comments
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                       (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                       (lnum, nl_pos), (lnum, len(line)), line)
                continue

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        # Hold back 'async' until we see what follows it.
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Use a context manager so the file handle is closed deterministically
        # (the original `tokenize(open(...).readline)` leaked it).
        with open(sys.argv[1]) as f:
            tokenize(f.readline)
    else:
        tokenize(sys.stdin.readline)