function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
+import sys
from typing import (
Callable,
+ Final,
Iterable,
Iterator,
List,
Optional,
- Text,
- Tuple,
Pattern,
+ Set,
+ Tuple,
Union,
cast,
)
-from blib2to3.pgen2.token import *
+
from blib2to3.pgen2.grammar import Grammar
+from blib2to3.pgen2.token import (
+ ASYNC,
+ AWAIT,
+ COMMENT,
+ DEDENT,
+ ENDMARKER,
+ ERRORTOKEN,
+ INDENT,
+ NAME,
+ NEWLINE,
+ NL,
+ NUMBER,
+ OP,
+ STRING,
+ tok_name,
+)
__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
-import regex as re
+import re
from codecs import BOM_UTF8, lookup
-from blib2to3.pgen2.token import *
from . import token
del token
def group(*choices: str) -> str:
    """Return a regex group that matches any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in parentheses.
    Nothing is escaped, so every choice must already be a valid regex
    fragment. With no choices the result is the empty group ``"()"``.
    """
    return "(" + "|".join(choices) + ")"
# NOTE: this shadows the builtin ``any`` for the rest of the module; the name
# is kept because the pattern definitions below call it as a regex helper.
def any(*choices: str) -> str:
    """Return a regex matching zero or more repetitions of any of *choices*.

    Equivalent to ``group(*choices)`` followed by the ``*`` quantifier.
    """
    return group(*choices) + "*"
def maybe(*choices: str) -> str:
    """Return a regex matching at most one occurrence of any of *choices*.

    Equivalent to ``group(*choices)`` followed by the ``?`` quantifier.
    """
    return group(*choices) + "?"
def _combinations(*parts: str) -> Set[str]:
    """Return each part alone plus every ordered pair of distinct parts.

    For every ``x`` in *parts*, ``x`` is concatenated with each ``y`` taken
    from *parts* and the empty string. A pair is kept only when ``x`` and
    ``y`` differ after case folding, so an entry is never combined with
    itself or with a differently-cased spelling of itself.
    """
    # The trailing "" is what lets each part appear on its own in the result.
    suffixes = parts + ("",)
    return {
        head + tail
        for head in parts
        for tail in suffixes
        if head.casefold() != tail.casefold()
    }
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = ( # this is invalid but it's fine because Name comes after Number in all groups
- r"\w+"
+ r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-pseudoprog = re.compile(PseudoToken, re.UNICODE)
+pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
| {"u", "U", "ur", "uR", "Ur", "UR"}
)
-endprogs = {
+endprogs: Final = {
"'": re.compile(Single),
'"': re.compile(Double),
"'''": single3prog,
'"""': double3prog,
**{f"{prefix}'''": single3prog for prefix in _strprefixes},
**{f'{prefix}"""': double3prog for prefix in _strprefixes},
- **{prefix: None for prefix in _strprefixes},
}
-triple_quoted = (
+triple_quoted: Final = (
{"'''", '"""'}
| {f"{prefix}'''" for prefix in _strprefixes}
| {f'{prefix}"""' for prefix in _strprefixes}
)
-single_quoted = (
+single_quoted: Final = (
{"'", '"'}
| {f"{prefix}'" for prefix in _strprefixes}
| {f'{prefix}"' for prefix in _strprefixes}
pass
# (row, column) position of a token boundary.
Coord = Tuple[int, int]


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)``.

    Args:
        type: Numeric token type; looked up in ``tok_name`` for display.
        token: The token text.
        srow_col: ``(row, column)`` where the token starts.
        erow_col: ``(row, column)`` where the token ends.
        line: Accepted so the signature matches the five-argument token
            callback shape; it is not printed.
    """
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )
-Coord = Tuple[int, int]
-TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
+TokenEater = Callable[[int, str, Coord, Coord, str], None]
-def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
+def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
# backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    """Feed every token produced from *readline* to *tokeneater*.

    Args:
        readline: Callable handed to ``generate_tokens`` as its line source.
        tokeneater: Callback invoked once per token, with the token's
            fields unpacked as positional arguments.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
-GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
+GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
class Untokenizer:
-
- tokens: List[Text]
+ tokens: List[str]
prev_row: int
prev_col: int
if col_offset:
self.tokens.append(" " * col_offset)
- def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
+ def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
for t in iterable:
if len(t) == 2:
self.compat(cast(Tuple[int, str], t), iterable)
break
tok_type, token, start, end, line = cast(
- Tuple[int, Text, Coord, Coord, Text], t
+ Tuple[int, str, Coord, Coord, str], t
)
self.add_whitespace(start)
self.tokens.append(token)
self.prev_col = 0
return "".join(self.tokens)
- def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
+ def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
startline = False
indents = []
toks_append = self.tokens.append
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
-blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
def _get_normal_name(orig_enc: str) -> str:
try:
return readline()
except StopIteration:
- return bytes()
+ return b""
def find_cookie(line: bytes) -> Optional[str]:
try:
return default, [first, second]
-def untokenize(iterable: Iterable[TokenInfo]) -> Text:
+def untokenize(iterable: Iterable[TokenInfo]) -> str:
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
def generate_tokens(
- readline: Callable[[], Text], grammar: Optional[Grammar] = None
+ readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
"""
The generate_tokens() generator requires one argument, readline, which
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
- numchars = "0123456789"
+ numchars: Final[str] = "0123456789"
contstr, needcont = "", 0
contline: Optional[str] = None
indents = [0]
# `await` as keywords.
async_keywords = False if grammar is None else grammar.async_keywords
# 'stashed' and 'async_*' are used for async/await parsing
- stashed = None
+ stashed: Optional[GoodTokenInfo] = None
async_def = False
async_def_indent = 0
async_def_nl = False
line = readline()
except StopIteration:
line = ""
- lnum = lnum + 1
+ lnum += 1
pos, max = 0, len(line)
if contstr: # continued string
column = 0
while pos < max: # measure leading whitespace
if line[pos] == " ":
- column = column + 1
+ column += 1
elif line[pos] == "\t":
column = (column // tabsize + 1) * tabsize
elif line[pos] == "\f":
column = 0
else:
break
- pos = pos + 1
+ pos += 1
if pos == max:
break
COMMENT,
comment_token,
(lnum, pos),
- (lnum, pos + len(comment_token)),
+ (lnum, nl_pos),
line,
)
yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
):
if token[-1] == "\n": # continued string
strstart = (lnum, start)
- endprog = (
- endprogs[initial]
- or endprogs[token[1]]
- or endprogs[token[2]]
+ maybe_endprog = (
+ endprogs.get(initial)
+ or endprogs.get(token[1])
+ or endprogs.get(token[2])
)
+ assert (
+ maybe_endprog is not None
+ ), f"endprog not found for {token}"
+ endprog = maybe_endprog
contstr, needcont = line[start:], 1
contline = line
break
if token in ("def", "for"):
if stashed and stashed[0] == NAME and stashed[1] == "async":
-
if token == "def":
async_def = True
async_def_indent = indents[-1]
continued = 1
else:
if initial in "([{":
- parenlev = parenlev + 1
+ parenlev += 1
elif initial in ")]}":
- parenlev = parenlev - 1
+ parenlev -= 1
if stashed:
yield stashed
stashed = None
yield (OP, token, spos, epos, line)
else:
yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
- pos = pos + 1
+ pos += 1
if stashed:
yield stashed
stashed = None
- for indent in indents[1:]: # pop remaining indent levels
+ for _indent in indents[1:]: # pop remaining indent levels
yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
if __name__ == "__main__": # testing
- import sys
-
if len(sys.argv) > 1:
tokenize(open(sys.argv[1]).readline)
else: