# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
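# Example (illustrative sketch, not part of the original module): how a caller
# typically drives generate_tokens().  The helper below is defined for
# documentation purposes only and is never invoked by this module.
def _example_list_tokens(source):
    """Return the 5-tuples produced for `source`, a string of Python code."""
    import io
    return list(generate_tokens(io.StringIO(source).readline))
    # For "x = 1\n" the first tuple is (NAME, 'x', (1, 0), (1, 1), 'x = 1\n').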
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token
try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
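# Example (illustrative, not part of the original module): the helpers above
# just build alternation groups, so Number expands to one regex that accepts
# any numeric literal.  Defined for documentation only; never called here.
def _example_is_number(text):
    """Return True if `text` is a complete numeric literal per Number."""
    return re.fullmatch(Number, text) is not None
    # _example_is_number("0x_FF") is True, _example_is_number("1_000.5e-3j")
    # is True, _example_is_number("abc") is False.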
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@`|^=<>:]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
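# Example (illustrative, not part of the original module): because r"\*\*=?" is
# listed before the single-character operators, an augmented power assignment
# is matched as one token instead of several.  Defined for documentation only.
def _example_longest_operator():
    return re.match(Operator, "**=").group()   # returns '**=', not '*'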
# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
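# Example (illustrative, not part of the original module): generate_tokens()
# below repeatedly calls pseudoprog.match(line, pos) and takes span(1) of each
# match as the next raw token.  Defined for documentation only; never called
# by this module.
def _example_scan_line(line):
    """Return the raw token strings pseudoprog finds in one line."""
    pos, found = 0, []
    while pos < len(line):
        m = pseudoprog.match(line, pos)
        if not m:
            break
        start, end = m.span(1)
        found.append(line[start:end])
        pos = end
    return found
    # _example_scan_line("x = 1  # hi\n") -> ['x', '=', '1', '# hi', '\n']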
_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}
129 {f"{prefix}'''" for prefix in _strprefixes} |
130 {f'{prefix}"""' for prefix in _strprefixes}
134 {f"{prefix}'" for prefix in _strprefixes} |
135 {f'{prefix}"' for prefix in _strprefixes}
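# Example (illustrative, not part of the original module): these tables let the
# tokenizer pick the right "find the closing quote" pattern from the opening
# text it has already seen.  Defined for documentation only; never called here.
def _example_string_tables():
    assert "f'''" in triple_quoted           # any legal prefix + triple quote
    assert "rb'" in single_quoted            # any legal prefix + single quote
    # endprogs maps an opening quote (with or without prefix) to the compiled
    # pattern that consumes everything up to the matching closing quote.
    assert endprogs["'''"] is single3prog
    assert endprogs['f"""'] is double3prog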
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line):  # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
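# Example (illustrative, not part of the original module): the older callback
# interface in action -- a tokeneater that records only comment tokens.
# Defined for documentation purposes only; never called by this module.
def _example_collect_comments(readline):
    """Return the text of every COMMENT token produced from `readline`."""
    comments = []
    def eater(type, token, start, end, line):
        if type == COMMENT:
            comments.append(token)
    tokenize_loop(readline, eater)
    return comments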
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
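# Example (illustrative, not part of the original module): spelling variants
# collapse onto canonical codec names, while unknown names pass through for
# codecs.lookup() to validate later.  Defined for documentation only.
def _example_normal_names():
    assert _get_normal_name("UTF_8") == "utf-8"
    assert _get_normal_name("Latin-1") == "iso-8859-1"
    assert _get_normal_name("euc-jp") == "euc-jp"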
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
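# Example (illustrative, not part of the original module): detecting the
# declared encoding of a byte string with a PEP 263 cookie.  Defined for
# documentation only; never called by this module.
def _example_detect_encoding():
    import io
    src = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines_read = detect_encoding(io.BytesIO(src).readline)
    return encoding  # 'iso-8859-1', the normalized name for latin-1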
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
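# Example (illustrative, not part of the original module): the limited round
# trip described in the docstring above, using two-element (type, value)
# tuples.  Defined for documentation only; never called by this module.
def _example_roundtrip(source):
    """Tokenize `source`, rebuild text from pairs, re-tokenize, and compare."""
    import io
    t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(io.StringIO(newcode).readline)]
    return t1 == t2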
def generate_tokens(readline, grammar=None):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False
    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == '#':               # skip comments
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                       (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                       (lnum, nl_pos), (lnum, len(line)), line)
                continue
            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_keywords or async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1
    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
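# Example (illustrative, not part of the original module): forcing `async` and
# `await` to be tokenized as keywords by passing a grammar-like object, which
# is all the async_keywords check above requires.  The _FakeGrammar class is
# hypothetical and exists only for this sketch; it is never called here.
def _example_async_keywords(source):
    import io
    class _FakeGrammar:
        async_keywords = True  # pretend the target grammar is Python 3.7+
    return [
        tok[:2]
        for tok in generate_tokens(io.StringIO(source).readline, _FakeGrammar())
    ]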
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)