# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
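
# --- Illustrative example (not part of the original module) ------------------
# A minimal sketch of driving the generator described above.  The helper name
# _demo_generate_tokens is hypothetical; generate_tokens() and tok_name are
# defined later in this module and in token.py respectively.
def _demo_generate_tokens():
    import io
    source = "x = 1  # set x\n"
    for tok_type, tok_str, start, end, line in generate_tokens(
            io.StringIO(source).readline):
        print(tok_name[tok_type], repr(tok_str), start, end)
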
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
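
# --- Illustrative example (not part of the original module) ------------------
# A small sketch of why the ordering noted above matters: Python's regex
# alternation tries branches left to right, so listing "=" before "==" would
# match only the first "=".  The helper name is hypothetical.
def _demo_operator_ordering():
    longest_first = re.compile(group(r"==", r"=")).match("==").group()   # '=='
    shortest_first = re.compile(group(r"=", r"==")).match("==").group()  # '='
    return longest_first, shortest_first
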
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
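
# --- Illustrative example (not part of the original module) ------------------
# A sketch of the callback-style entry point described in tokenize()'s
# docstring: collect (token name, token string) pairs instead of printing
# them.  The helper names are hypothetical.
def _demo_tokenize_callback():
    import io
    seen = []
    def eater(tok_type, tok_str, start, end, line):
        seen.append((tok_name[tok_type], tok_str))
    tokenize(io.StringIO("a + b\n").readline, eater)
    return seen
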

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
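
# --- Illustrative example (not part of the original module) ------------------
# A sketch of the normalisation performed above; the helper name is
# hypothetical.
def _demo_get_normal_name():
    # returns ('utf-8', 'iso-8859-1', 'cp1252')
    return (_get_normal_name("UTF_8"),
            _get_normal_name("Latin-1"),
            _get_normal_name("cp1252"))
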

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
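
# --- Illustrative example (not part of the original module) ------------------
# A sketch of calling detect_encoding() as described in its docstring: the
# readline argument must yield bytes, e.g. from a file opened in binary mode.
# The helper name is hypothetical.
def _demo_detect_encoding():
    import io
    buf = io.BytesIO(b"# -*- coding: iso-8859-1 -*-\nx = 1\n")
    encoding, lines = detect_encoding(buf.readline)
    # encoding == 'iso-8859-1'; lines holds the raw line(s) already consumed
    return encoding, lines
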

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
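
# --- Illustrative example (not part of the original module) ------------------
# A sketch of the full-tuple round trip promised in untokenize()'s docstring:
# with complete 5-tuples the original source is reproduced exactly.  The
# helper name is hypothetical.
def _demo_untokenize_roundtrip():
    import io
    source = "def f(a, b):\n    return a + b\n"
    tokens = list(generate_tokens(io.StringIO(source).readline))
    assert untokenize(tokens) == source
    return untokenize(tokens)
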

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
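
# --- Illustrative example (not part of the original module) ------------------
# A sketch combining detect_encoding() and generate_tokens(): read a source
# file with its declared encoding and tokenize it.  The helper name and its
# path parameter are hypothetical.
def _demo_tokenize_file(path):
    with open(path, "rb") as f:
        encoding, _ = detect_encoding(f.readline)
    with open(path, encoding=encoding) as f:
        return list(generate_tokens(f.readline))
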

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)