# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
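# For example, tokenizing the single line "x = 1\n" yields 5-tuples roughly of
# the form (NAME, 'x', (1, 0), (1, 1), 'x = 1\n'), followed by an OP token for
# '=', a NUMBER token for '1', and a NEWLINE token (values shown here only for
# illustration).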
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
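# For illustration: group('a', 'b') builds '(a|b)', any('a', 'b') builds
# '(a|b)*', and maybe('a', 'b') builds '(a|b)?'.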
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
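# For example, Number matches integer literals like '0b1010', '0xFF', and
# '1_000', float literals like '3.14e-2', and imaginary literals like '2j'.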
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}
triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)
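# So, for example, triple_quoted contains entries such as "'''", 'f"""', and
# "Rb'''", while single_quoted contains the corresponding one-quote forms.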
class TokenError(Exception): pass

class StopTokenizing(Exception): pass
def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
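# Illustrative sketch (not part of the original module): a tokeneater callback
# that collects (token name, token string) pairs instead of printing them.
# The helper name and the io.StringIO-based readline are assumptions made for
# the example.
def _collect_tokens_example(source_text):
    import io
    collected = []
    def eater(type, token, start, end, line):
        collected.append((tok_name[type], token))
    tokenize(io.StringIO(source_text).readline, eater)
    return collected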
class Untokenizer:

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
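# For example, cookie_re matches a comment line such as
# "# -*- coding: latin-1 -*-" and captures 'latin-1'; blank_re matches lines
# that are empty or contain only a comment.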
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised. Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding
    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
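# Illustrative sketch (not part of the original module): detecting the encoding
# of an in-memory byte stream that carries a PEP 263 coding cookie. The helper
# name and the io.BytesIO-based readline are assumptions made for the example.
def _detect_encoding_example():
    import io
    buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
    encoding, consumed_lines = detect_encoding(buf.readline)
    return encoding  # 'iso-8859-1'; consumed_lines holds the bytes already read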
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
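# Illustrative sketch (not part of the original module): a full round trip
# through generate_tokens() and untokenize() using the complete 5-tuples, so
# the output matches the input exactly. The helper name and the io.StringIO
# source are assumptions made for the example.
def _roundtrip_example(source_text):
    import io
    tokens = list(generate_tokens(io.StringIO(source_text).readline))
    return untokenize(tokens)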
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False
    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        pos, max = 0, len(line)
        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
            if line[pos] in '\r\n':            # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == '#':               # skip comments
                comment_token = line[pos:].rstrip('\r\n')
                nl_pos = pos + len(comment_token)
                yield (COMMENT, comment_token,
                       (lnum, pos), (lnum, pos + len(comment_token)), line)
                yield (NL, line[nl_pos:],
                       (lnum, nl_pos), (lnum, len(line)), line)
                continue
            if column > indents[-1]:           # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:        # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0
        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):
                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
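# Illustrative sketch (not part of the original module): driving
# generate_tokens() from an in-memory string, as the docstring above describes.
# The helper name and the io.StringIO-based readline are assumptions made for
# the example.
def _print_tokens_example(source_text):
    import io
    for toknum, tokval, spos, epos, logical_line in generate_tokens(
            io.StringIO(source_text).readline):
        print(tok_name[toknum], repr(tokval), spos, epos)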
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)