# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
+# mypy: allow-untyped-defs, allow-untyped-calls
+
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
-__author__ = 'Ka-Ping Yee <ping@lfw.org>'
-__credits__ = \
- 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+from typing import (
+ Callable,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Text,
+ Tuple,
+ Pattern,
+ Union,
+ cast,
+)
+from blib2to3.pgen2.token import *
+from blib2to3.pgen2.grammar import Grammar
+
+__author__ = "Ka-Ping Yee <ping@lfw.org>"
+__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
-import re
+import regex as re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *
from . import token
-__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
- "generate_tokens", "untokenize"]
+
+__all__ = [x for x in dir(token) if x[0] != "_"] + [
+ "tokenize",
+ "generate_tokens",
+ "untokenize",
+]
del token
-try:
- bytes
-except NameError:
- # Support bytes type in Python <= 2.5, so 2to3 turns itself into
- # valid Python 3 code.
- bytes = str
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
+def group(*choices):
+ return "(" + "|".join(choices) + ")"
+
+
+def any(*choices):
+ return group(*choices) + "*"
+
+
+def maybe(*choices):
+ return group(*choices) + "?"
+
+
def _combinations(*l):
- return set(
- x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
- )
+ return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
+
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+' # this is invalid but it's fine because Name comes after Number in all groups
+Whitespace = r"[ \f\t]*"
+Comment = r"#[^\r\n]*"
+Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
+Name = r"\w+" # this is invalid but it's fine because Name comes after Number in all groups
-Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
-Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
-Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
-Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
+Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
+Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
+Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
+Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+(?:_\d+)*'
-Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
-Expfloat = r'\d+(?:_\d+)*' + Exponent
+Exponent = r"[eE][-+]?\d+(?:_\d+)*"
+Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
+ Exponent
+)
+Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
-String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(
+ _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
+)
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
- r"~")
+Operator = group(
+ r"\*\*=?",
+ r">>=?",
+ r"<<=?",
+ r"<>",
+ r"!=",
+ r"//=?",
+ r"->",
+ r"[+\-*/%&@|^=<>:]=?",
+ r"~",
+)
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'[:;.,`@]')
+Bracket = "[][(){}]"
+Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
-ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
- group("'", r'\\\r?\n'),
- _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
- group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+ContStr = group(
+ _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
+ _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
+)
+PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog = re.compile(Token, re.UNICODE)
double3prog = re.compile(Double3)
_strprefixes = (
- _combinations('r', 'R', 'f', 'F') |
- _combinations('r', 'R', 'b', 'B') |
- {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
+ _combinations("r", "R", "f", "F")
+ | _combinations("r", "R", "b", "B")
+ | {"u", "U", "ur", "uR", "Ur", "UR"}
)
-endprogs = {"'": re.compile(Single), '"': re.compile(Double),
- "'''": single3prog, '"""': double3prog,
- **{f"{prefix}'''": single3prog for prefix in _strprefixes},
- **{f'{prefix}"""': double3prog for prefix in _strprefixes},
- **{prefix: None for prefix in _strprefixes}}
+endprogs = {
+ "'": re.compile(Single),
+ '"': re.compile(Double),
+ "'''": single3prog,
+ '"""': double3prog,
+ **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+ **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+ **{prefix: None for prefix in _strprefixes},
+}
triple_quoted = (
- {"'''", '"""'} |
- {f"{prefix}'''" for prefix in _strprefixes} |
- {f'{prefix}"""' for prefix in _strprefixes}
+ {"'''", '"""'}
+ | {f"{prefix}'''" for prefix in _strprefixes}
+ | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
- {"'", '"'} |
- {f"{prefix}'" for prefix in _strprefixes} |
- {f'{prefix}"' for prefix in _strprefixes}
+ {"'", '"'}
+ | {f"{prefix}'" for prefix in _strprefixes}
+ | {f'{prefix}"' for prefix in _strprefixes}
)
tabsize = 8
-class TokenError(Exception): pass
-class StopTokenizing(Exception): pass
+class TokenError(Exception):
+ pass
+
+
+class StopTokenizing(Exception):
+ pass
-def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
+
+def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
(srow, scol) = xxx_todo_changeme
(erow, ecol) = xxx_todo_changeme1
- print("%d,%d-%d,%d:\t%s\t%s" % \
- (srow, scol, erow, ecol, tok_name[type], repr(token)))
+ print(
+ "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
+ )
-def tokenize(readline, tokeneater=printtoken):
+
+Coord = Tuple[int, int]
+TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
+
+
+def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
except StopTokenizing:
pass
+
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
+
+GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
+TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
+
+
class Untokenizer:
- def __init__(self):
+ tokens: List[Text]
+ prev_row: int
+ prev_col: int
+
+ def __init__(self) -> None:
self.tokens = []
self.prev_row = 1
self.prev_col = 0
- def add_whitespace(self, start):
+ def add_whitespace(self, start: Coord) -> None:
row, col = start
assert row <= self.prev_row
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
- def untokenize(self, iterable):
+ def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
for t in iterable:
if len(t) == 2:
- self.compat(t, iterable)
+ self.compat(cast(Tuple[int, str], t), iterable)
break
- tok_type, token, start, end, line = t
+ tok_type, token, start, end, line = cast(
+ Tuple[int, Text, Coord, Coord, Text], t
+ )
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
self.prev_col = 0
return "".join(self.tokens)
- def compat(self, token, iterable):
+ def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
startline = False
indents = []
toks_append = self.tokens.append
toknum, tokval = token
if toknum in (NAME, NUMBER):
- tokval += ' '
+ tokval += " "
if toknum in (NEWLINE, NL):
startline = True
for tok in iterable:
toknum, tokval = tok[:2]
if toknum in (NAME, NUMBER, ASYNC, AWAIT):
- tokval += ' '
+ tokval += " "
if toknum == INDENT:
indents.append(tokval)
startline = False
toks_append(tokval)
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-def _get_normal_name(orig_enc):
+cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
+blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+
+
+def _get_normal_name(orig_enc: str) -> str:
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
- if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
- enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+ if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
+ ("latin-1-", "iso-8859-1-", "iso-latin-1-")
+ ):
return "iso-8859-1"
return orig_enc
-def detect_encoding(readline):
+
+def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
"""
bom_found = False
encoding = None
- default = 'utf-8'
- def read_or_stop():
+ default = "utf-8"
+
+ def read_or_stop() -> bytes:
try:
return readline()
except StopIteration:
return bytes()
- def find_cookie(line):
+ def find_cookie(line: bytes) -> Optional[str]:
try:
- line_string = line.decode('ascii')
+ line_string = line.decode("ascii")
except UnicodeDecodeError:
return None
match = cookie_re.match(line_string)
raise SyntaxError("unknown encoding: " + encoding)
if bom_found:
- if codec.name != 'utf-8':
+ if codec.name != "utf-8":
# This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- encoding += '-sig'
+ raise SyntaxError("encoding problem: utf-8")
+ encoding += "-sig"
return encoding
first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
- default = 'utf-8-sig'
+ default = "utf-8-sig"
if not first:
return default, []
return default, [first, second]
-def untokenize(iterable):
+
+def untokenize(iterable: Iterable[TokenInfo]) -> Text:
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
Round-trip invariant for full input:
Untokenized source will match input source exactly
- Round-trip invariant for limited intput:
+ Round-trip invariant for limited input:
# Output text will tokenize back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
ut = Untokenizer()
return ut.untokenize(iterable)
-def generate_tokens(readline):
+
+def generate_tokens(
+ readline: Callable[[], Text], grammar: Optional[Grammar] = None
+) -> Iterator[GoodTokenInfo]:
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
- numchars = '0123456789'
- contstr, needcont = '', 0
- contline = None
+ numchars = "0123456789"
+ contstr, needcont = "", 0
+ contline: Optional[str] = None
indents = [0]
+ # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+ # `await` as keywords.
+ async_keywords = False if grammar is None else grammar.async_keywords
# 'stashed' and 'async_*' are used for async/await parsing
stashed = None
async_def = False
async_def_indent = 0
async_def_nl = False
- while 1: # loop over lines in stream
+ strstart: Tuple[int, int]
+ endprog: Pattern[str]
+
+ while 1: # loop over lines in stream
try:
line = readline()
except StopIteration:
- line = ''
+ line = ""
lnum = lnum + 1
pos, max = 0, len(line)
- if contstr: # continued string
+ if contstr: # continued string
+ assert contline is not None
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
- yield (STRING, contstr + line[:end],
- strstart, (lnum, end), contline + line)
- contstr, needcont = '', 0
+ yield (
+ STRING,
+ contstr + line[:end],
+ strstart,
+ (lnum, end),
+ contline + line,
+ )
+ contstr, needcont = "", 0
contline = None
- elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
- yield (ERRORTOKEN, contstr + line,
- strstart, (lnum, len(line)), contline)
- contstr = ''
+ elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
+ yield (
+ ERRORTOKEN,
+ contstr + line,
+ strstart,
+ (lnum, len(line)),
+ contline,
+ )
+ contstr = ""
contline = None
continue
else:
continue
elif parenlev == 0 and not continued: # new statement
- if not line: break
+ if not line:
+ break
column = 0
- while pos < max: # measure leading whitespace
- if line[pos] == ' ': column = column + 1
- elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
- elif line[pos] == '\f': column = 0
- else: break
+ while pos < max: # measure leading whitespace
+ if line[pos] == " ":
+ column = column + 1
+ elif line[pos] == "\t":
+ column = (column // tabsize + 1) * tabsize
+ elif line[pos] == "\f":
+ column = 0
+ else:
+ break
pos = pos + 1
- if pos == max: break
+ if pos == max:
+ break
if stashed:
yield stashed
stashed = None
- if line[pos] in '\r\n': # skip blank lines
+ if line[pos] in "\r\n": # skip blank lines
yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
continue
- if line[pos] == '#': # skip comments
- comment_token = line[pos:].rstrip('\r\n')
+ if line[pos] == "#": # skip comments
+ comment_token = line[pos:].rstrip("\r\n")
nl_pos = pos + len(comment_token)
- yield (COMMENT, comment_token,
- (lnum, pos), (lnum, pos + len(comment_token)), line)
- yield (NL, line[nl_pos:],
- (lnum, nl_pos), (lnum, len(line)), line)
+ yield (
+ COMMENT,
+ comment_token,
+ (lnum, pos),
+ (lnum, pos + len(comment_token)),
+ line,
+ )
+ yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
continue
- if column > indents[-1]: # count indents
+ if column > indents[-1]: # count indents
indents.append(column)
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
- while column < indents[-1]: # count dedents
+ while column < indents[-1]: # count dedents
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
- ("<tokenize>", lnum, pos, line))
+ ("<tokenize>", lnum, pos, line),
+ )
indents = indents[:-1]
if async_def and async_def_indent >= indents[-1]:
async_def_nl = False
async_def_indent = 0
- yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+ yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
- else: # continued statement
+ else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
- if pseudomatch: # scan for tokens
+ if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
- if initial in numchars or \
- (initial == '.' and token != '.'): # ordinary number
+ if initial in numchars or (
+ initial == "." and token != "."
+ ): # ordinary number
yield (NUMBER, token, spos, epos, line)
- elif initial in '\r\n':
+ elif initial in "\r\n":
newline = NEWLINE
if parenlev > 0:
newline = NL
stashed = None
yield (newline, token, spos, epos, line)
- elif initial == '#':
+ elif initial == "#":
assert not token.endswith("\n")
if stashed:
yield stashed
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
- if endmatch: # all on one line
+ if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
if stashed:
stashed = None
yield (STRING, token, spos, (lnum, pos), line)
else:
- strstart = (lnum, start) # multiple lines
+ strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
- elif initial in single_quoted or \
- token[:2] in single_quoted or \
- token[:3] in single_quoted:
- if token[-1] == '\n': # continued string
+ elif (
+ initial in single_quoted
+ or token[:2] in single_quoted
+ or token[:3] in single_quoted
+ ):
+ if token[-1] == "\n": # continued string
strstart = (lnum, start)
- endprog = (endprogs[initial] or endprogs[token[1]] or
- endprogs[token[2]])
+ endprog = (
+ endprogs[initial]
+ or endprogs[token[1]]
+ or endprogs[token[2]]
+ )
contstr, needcont = line[start:], 1
contline = line
break
- else: # ordinary string
+ else: # ordinary string
if stashed:
yield stashed
stashed = None
yield (STRING, token, spos, epos, line)
- elif initial.isidentifier(): # ordinary name
- if token in ('async', 'await'):
- if async_def:
- yield (ASYNC if token == 'async' else AWAIT,
- token, spos, epos, line)
+ elif initial.isidentifier(): # ordinary name
+ if token in ("async", "await"):
+ if async_keywords or async_def:
+ yield (
+ ASYNC if token == "async" else AWAIT,
+ token,
+ spos,
+ epos,
+ line,
+ )
continue
tok = (NAME, token, spos, epos, line)
- if token == 'async' and not stashed:
+ if token == "async" and not stashed:
stashed = tok
continue
- if token == 'def':
- if (stashed
- and stashed[0] == NAME
- and stashed[1] == 'async'):
+ if token in ("def", "for"):
+ if stashed and stashed[0] == NAME and stashed[1] == "async":
- async_def = True
- async_def_indent = indents[-1]
+ if token == "def":
+ async_def = True
+ async_def_indent = indents[-1]
- yield (ASYNC, stashed[1],
- stashed[2], stashed[3],
- stashed[4])
+ yield (
+ ASYNC,
+ stashed[1],
+ stashed[2],
+ stashed[3],
+ stashed[4],
+ )
stashed = None
if stashed:
stashed = None
yield tok
- elif initial == '\\': # continued stmt
+ elif initial == "\\": # continued stmt
# This yield is new; needed for better idempotency:
if stashed:
yield stashed
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
- if initial in '([{': parenlev = parenlev + 1
- elif initial in ')]}': parenlev = parenlev - 1
+ if initial in "([{":
+ parenlev = parenlev + 1
+ elif initial in ")]}":
+ parenlev = parenlev - 1
if stashed:
yield stashed
stashed = None
yield (OP, token, spos, epos, line)
else:
- yield (ERRORTOKEN, line[pos],
- (lnum, pos), (lnum, pos+1), line)
+ yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
pos = pos + 1
if stashed:
yield stashed
stashed = None
- for indent in indents[1:]: # pop remaining indent levels
- yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
- yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+ for indent in indents[1:]: # pop remaining indent levels
+ yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
+ yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
-if __name__ == '__main__': # testing
+
+if __name__ == "__main__": # testing
import sys
- if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
- else: tokenize(sys.stdin.readline)
+
+ if len(sys.argv) > 1:
+ tokenize(open(sys.argv[1]).readline)
+ else:
+ tokenize(sys.stdin.readline)