All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you read over the Git project's submission guidelines and adhere to them,
I'd be especially grateful.
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token string
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
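
# Illustrative usage sketch, added for exposition; the _demo_* helper below is
# hypothetical and not part of the upstream module. It drives generate_tokens()
# with a readline callable built from an in-memory string and unpacks the
# 5-tuples described above.
def _demo_generate_tokens():
    import io

    source = "x = 1 + 2\n"
    for tok_type, tok_string, start, end, logical_line in generate_tokens(
        io.StringIO(source).readline
    ):
        print(tok_name[tok_type], repr(tok_string), start, end)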

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re, unicodedata
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]

# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )
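
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It shows what the small pattern-building
# helpers above return.
def _demo_pattern_helpers():
    assert group('a', 'b') == '(a|b)'
    assert any(r'\d') == r'(\d)*'
    assert maybe('x') == '(x)?'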

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
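
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It shows a few literal forms accepted by
# the Number pattern built above, including underscore digit separators.
def _demo_number_pattern():
    number_re = re.compile(Number)
    for literal in ("0b1010", "0x_FF", "1_000", "3.14e-10", "10j"):
        assert number_re.fullmatch(literal), literal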

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
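
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It demonstrates why the longest
# alternatives must come first in the Operator alternation, as the comment
# above explains.
def _demo_operator_ordering():
    longest_first = re.compile(group(r"==", r"="))
    shortest_first = re.compile(group(r"=", r"=="))
    assert longest_first.match("==").group() == "=="   # one '==' operator
    assert shortest_first.match("==").group() == "="   # wrongly split into '='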

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
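
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It shows tokenize() driven with a custom
# tokeneater callback instead of the default printtoken(), as described in the
# docstring above.
def _demo_tokeneater():
    import io

    collected = []

    def tokeneater(tok_type, tok_string, start, end, logical_line):
        collected.append((tok_name[tok_type], tok_string))

    tokenize(io.StringIO("a = b\n").readline, tokeneater)
    return collected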

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
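
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. detect_encoding() reads at most two
# lines and reports the encoding implied by a PEP 263 coding cookie, falling
# back to 'utf-8'.
def _demo_detect_encoding():
    import io

    with_cookie = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines = detect_encoding(io.BytesIO(with_cookie).readline)
    assert encoding == "iso-8859-1" and lines == [b"# -*- coding: latin-1 -*-\n"]

    plain = b"x = 1\n"
    assert detect_encoding(io.BytesIO(plain).readline) == ("utf-8", [b"x = 1\n"])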

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
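
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It performs the "limited input" round
# trip described in the docstring above, using 2-tuples so that
# Untokenizer.compat() is exercised.
def _demo_untokenize_roundtrip():
    import io

    source = "def f(a, b):\n    return a + b\n"
    t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(keepends=True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2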

InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0

    # 'stashed' and 'async_*' are used for async/await parsing

    while 1:                                   # loop over lines in stream
        except StopIteration:
        pos, max = 0, len(line)

        if contstr:                            # continued string
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = contstr + line
                contline = contline + line

        elif parenlev == 0 and not continued:  # new statement
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0

            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)

            if line[pos] == '#':               # skip comments
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                       (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                       (lnum, nl_pos), (lnum, len(line)), line)

            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:

        else:                                  # continued statement
                raise TokenError("EOF in multi-line statement", (lnum, 0))

            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (newline, token, spos, epos, line)
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                        contstr, needcont = line[start:], 1
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif (initial in namechars or              # ordinary name
                      unicodedata.category(initial) in InitialCategories):
                    if token in ('async', 'await'):
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):
                                async_def_indent = indents[-1]
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
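
# Illustrative sketch, added for exposition; the _demo_* helper is hypothetical
# and not part of the upstream module. It prints the extra COMMENT/NL/INDENT/
# DEDENT tokens this tokenizer emits alongside the ordinary ones, for a small
# indented snippet.
def _demo_token_stream():
    import io

    source = "if x:  # comment\n\n    y = 1\n"
    for tok_type, tok_string, _, _, _ in generate_tokens(io.StringIO(source).readline):
        print(tok_name[tok_type], repr(tok_string))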

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)