X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/2082a325fdd14f0aabd88f7f12a20f9fb085c538..738c2789cc07dbbd6bcf5908fd06627ee8d5cd6c:/src/blib2to3/pgen2/tokenize.py?ds=inline diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index bad79b2..d0607f4 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,27 +27,44 @@ are the same, except instead of generating tokens, tokeneater is a callback function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" +import sys from typing import ( Callable, + Final, Iterable, Iterator, List, Optional, - Text, - Tuple, Pattern, + Set, + Tuple, Union, cast, ) -from blib2to3.pgen2.token import * + from blib2to3.pgen2.grammar import Grammar +from blib2to3.pgen2.token import ( + ASYNC, + AWAIT, + COMMENT, + DEDENT, + ENDMARKER, + ERRORTOKEN, + INDENT, + NAME, + NEWLINE, + NL, + NUMBER, + OP, + STRING, + tok_name, +) __author__ = "Ka-Ping Yee " __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" -import regex as re +import re from codecs import BOM_UTF8, lookup -from blib2to3.pgen2.token import * from . import token @@ -59,27 +76,27 @@ __all__ = [x for x in dir(token) if x[0] != "_"] + [ del token -def group(*choices): +def group(*choices: str) -> str: return "(" + "|".join(choices) + ")" -def any(*choices): +def any(*choices: str) -> str: return group(*choices) + "*" -def maybe(*choices): +def maybe(*choices: str) -> str: return group(*choices) + "?" -def _combinations(*l): - return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()) +def _combinations(*l: str) -> Set[str]: + return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()} Whitespace = r"[ \f\t]*" Comment = r"#[^\r\n]*" Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment) Name = ( # this is invalid but it's fine because Name comes after Number in all groups - r"\w+" + r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+" ) Binnumber = r"0[bB]_?[01]+(?:_[01]+)*" @@ -139,7 +156,7 @@ ContStr = group( PseudoExtras = group(r"\\\r?\n", Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) -pseudoprog = re.compile(PseudoToken, re.UNICODE) +pseudoprog: Final = re.compile(PseudoToken, re.UNICODE) single3prog = re.compile(Single3) double3prog = re.compile(Double3) @@ -149,22 +166,21 @@ _strprefixes = ( | {"u", "U", "ur", "uR", "Ur", "UR"} ) -endprogs = { +endprogs: Final = { "'": re.compile(Single), '"': re.compile(Double), "'''": single3prog, '"""': double3prog, **{f"{prefix}'''": single3prog for prefix in _strprefixes}, **{f'{prefix}"""': double3prog for prefix in _strprefixes}, - **{prefix: None for prefix in _strprefixes}, } -triple_quoted = ( +triple_quoted: Final = ( {"'''", '"""'} | {f"{prefix}'''" for prefix in _strprefixes} | {f'{prefix}"""' for prefix in _strprefixes} ) -single_quoted = ( +single_quoted: Final = ( {"'", '"'} | {f"{prefix}'" for prefix in _strprefixes} | {f'{prefix}"' for prefix in _strprefixes} @@ -181,19 +197,23 @@ class StopTokenizing(Exception): pass -def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing - (srow, scol) = xxx_todo_changeme - (erow, ecol) = xxx_todo_changeme1 +Coord = Tuple[int, int] + + +def printtoken( + type: int, token: str, srow_col: Coord, erow_col: Coord, line: str +) -> None: # for testing + (srow, scol) = srow_col + (erow, ecol) = erow_col print( "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token)) ) -Coord = Tuple[int, int] -TokenEater = Callable[[int, Text, Coord, Coord, Text], None] +TokenEater = Callable[[int, str, Coord, Coord, str], None] -def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None: +def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None: """ The tokenize() function accepts two parameters: one representing the input stream, and one providing an output mechanism for tokenize(). @@ -213,18 +233,17 @@ def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) # backwards compatible interface -def tokenize_loop(readline, tokeneater): +def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None: for token_info in generate_tokens(readline): tokeneater(*token_info) -GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text] +GoodTokenInfo = Tuple[int, str, Coord, Coord, str] TokenInfo = Union[Tuple[int, str], GoodTokenInfo] class Untokenizer: - - tokens: List[Text] + tokens: List[str] prev_row: int prev_col: int @@ -240,13 +259,13 @@ class Untokenizer: if col_offset: self.tokens.append(" " * col_offset) - def untokenize(self, iterable: Iterable[TokenInfo]) -> Text: + def untokenize(self, iterable: Iterable[TokenInfo]) -> str: for t in iterable: if len(t) == 2: self.compat(cast(Tuple[int, str], t), iterable) break tok_type, token, start, end, line = cast( - Tuple[int, Text, Coord, Coord, Text], t + Tuple[int, str, Coord, Coord, str], t ) self.add_whitespace(start) self.tokens.append(token) @@ -256,7 +275,7 @@ class Untokenizer: self.prev_col = 0 return "".join(self.tokens) - def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None: + def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None: startline = False indents = [] toks_append = self.tokens.append @@ -286,7 +305,7 @@ class Untokenizer: cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII) -blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII) +blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII) def _get_normal_name(orig_enc: str) -> str: @@ -328,7 +347,7 @@ def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]: try: return readline() except StopIteration: - return bytes() + return b"" def find_cookie(line: bytes) -> Optional[str]: try: @@ -377,7 +396,7 @@ def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]: return default, [first, second] -def untokenize(iterable: Iterable[TokenInfo]) -> Text: +def untokenize(iterable: Iterable[TokenInfo]) -> str: """Transform tokens back into Python source code. Each element returned by the iterable must be a token sequence @@ -400,7 +419,7 @@ def untokenize(iterable: Iterable[TokenInfo]) -> Text: def generate_tokens( - readline: Callable[[], Text], grammar: Optional[Grammar] = None + readline: Callable[[], str], grammar: Optional[Grammar] = None ) -> Iterator[GoodTokenInfo]: """ The generate_tokens() generator requires one argument, readline, which @@ -418,7 +437,7 @@ def generate_tokens( logical line; continuation lines are included. """ lnum = parenlev = continued = 0 - numchars = "0123456789" + numchars: Final[str] = "0123456789" contstr, needcont = "", 0 contline: Optional[str] = None indents = [0] @@ -427,7 +446,7 @@ def generate_tokens( # `await` as keywords. async_keywords = False if grammar is None else grammar.async_keywords # 'stashed' and 'async_*' are used for async/await parsing - stashed = None + stashed: Optional[GoodTokenInfo] = None async_def = False async_def_indent = 0 async_def_nl = False @@ -440,7 +459,7 @@ def generate_tokens( line = readline() except StopIteration: line = "" - lnum = lnum + 1 + lnum += 1 pos, max = 0, len(line) if contstr: # continued string @@ -481,14 +500,14 @@ def generate_tokens( column = 0 while pos < max: # measure leading whitespace if line[pos] == " ": - column = column + 1 + column += 1 elif line[pos] == "\t": column = (column // tabsize + 1) * tabsize elif line[pos] == "\f": column = 0 else: break - pos = pos + 1 + pos += 1 if pos == max: break @@ -507,7 +526,7 @@ def generate_tokens( COMMENT, comment_token, (lnum, pos), - (lnum, pos + len(comment_token)), + (lnum, nl_pos), line, ) yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line) @@ -592,11 +611,15 @@ def generate_tokens( ): if token[-1] == "\n": # continued string strstart = (lnum, start) - endprog = ( - endprogs[initial] - or endprogs[token[1]] - or endprogs[token[2]] + maybe_endprog = ( + endprogs.get(initial) + or endprogs.get(token[1]) + or endprogs.get(token[2]) ) + assert ( + maybe_endprog is not None + ), f"endprog not found for {token}" + endprog = maybe_endprog contstr, needcont = line[start:], 1 contline = line break @@ -624,7 +647,6 @@ def generate_tokens( if token in ("def", "for"): if stashed and stashed[0] == NAME and stashed[1] == "async": - if token == "def": async_def = True async_def_indent = indents[-1] @@ -652,29 +674,27 @@ def generate_tokens( continued = 1 else: if initial in "([{": - parenlev = parenlev + 1 + parenlev += 1 elif initial in ")]}": - parenlev = parenlev - 1 + parenlev -= 1 if stashed: yield stashed stashed = None yield (OP, token, spos, epos, line) else: yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line) - pos = pos + 1 + pos += 1 if stashed: yield stashed stashed = None - for indent in indents[1:]: # pop remaining indent levels + for _indent in indents[1:]: # pop remaining indent levels yield (DEDENT, "", (lnum, 0), (lnum, 0), "") yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") if __name__ == "__main__": # testing - import sys - if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) else: