X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/2082a325fdd14f0aabd88f7f12a20f9fb085c538..9e9fdce9a81a53fd3771e1825de523a6413b3c35:/src/blib2to3/pgen2/tokenize.py?ds=sidebyside

diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py
index bad79b2..d0607f4 100644
--- a/src/blib2to3/pgen2/tokenize.py
+++ b/src/blib2to3/pgen2/tokenize.py
@@ -27,27 +27,44 @@ are the same, except instead of generating tokens, tokeneater is a callback
 function to which the 5 fields described above are passed as 5 arguments,
 each time a new token is found."""
 
+import sys
 from typing import (
     Callable,
+    Final,
     Iterable,
     Iterator,
     List,
     Optional,
-    Text,
-    Tuple,
     Pattern,
+    Set,
+    Tuple,
     Union,
     cast,
 )
-from blib2to3.pgen2.token import *
+
 from blib2to3.pgen2.grammar import Grammar
+from blib2to3.pgen2.token import (
+    ASYNC,
+    AWAIT,
+    COMMENT,
+    DEDENT,
+    ENDMARKER,
+    ERRORTOKEN,
+    INDENT,
+    NAME,
+    NEWLINE,
+    NL,
+    NUMBER,
+    OP,
+    STRING,
+    tok_name,
+)
 
 __author__ = "Ka-Ping Yee <ping@lfw.org>"
 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
 
-import regex as re
+import re
 from codecs import BOM_UTF8, lookup
-from blib2to3.pgen2.token import *
 
 from . import token
 
@@ -59,27 +76,27 @@ __all__ = [x for x in dir(token) if x[0] != "_"] + [
 del token
 
 
-def group(*choices):
+def group(*choices: str) -> str:
     return "(" + "|".join(choices) + ")"
 
 
-def any(*choices):
+def any(*choices: str) -> str:
     return group(*choices) + "*"
 
 
-def maybe(*choices):
+def maybe(*choices: str) -> str:
     return group(*choices) + "?"
 
 
-def _combinations(*l):
-    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
+def _combinations(*l: str) -> Set[str]:
+    return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
 
 
 Whitespace = r"[ \f\t]*"
 Comment = r"#[^\r\n]*"
 Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
 Name = (  # this is invalid but it's fine because Name comes after Number in all groups
-    r"\w+"
+    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
 )
 
 Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
@@ -139,7 +156,7 @@ ContStr = group(
 PseudoExtras = group(r"\\\r?\n", Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
-pseudoprog = re.compile(PseudoToken, re.UNICODE)
+pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
 single3prog = re.compile(Single3)
 double3prog = re.compile(Double3)
 
@@ -149,22 +166,21 @@ _strprefixes = (
     | {"u", "U", "ur", "uR", "Ur", "UR"}
 )
 
-endprogs = {
+endprogs: Final = {
     "'": re.compile(Single),
     '"': re.compile(Double),
     "'''": single3prog,
     '"""': double3prog,
     **{f"{prefix}'''": single3prog for prefix in _strprefixes},
     **{f'{prefix}"""': double3prog for prefix in _strprefixes},
-    **{prefix: None for prefix in _strprefixes},
 }
 
-triple_quoted = (
+triple_quoted: Final = (
     {"'''", '"""'}
     | {f"{prefix}'''" for prefix in _strprefixes}
     | {f'{prefix}"""' for prefix in _strprefixes}
 )
-single_quoted = (
+single_quoted: Final = (
     {"'", '"'}
     | {f"{prefix}'" for prefix in _strprefixes}
     | {f'{prefix}"' for prefix in _strprefixes}
@@ -181,19 +197,23 @@ class StopTokenizing(Exception):
     pass
 
 
-def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
-    (srow, scol) = xxx_todo_changeme
-    (erow, ecol) = xxx_todo_changeme1
+Coord = Tuple[int, int]
+
+
+def printtoken(
+    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
+) -> None:  # for testing
+    (srow, scol) = srow_col
+    (erow, ecol) = erow_col
     print(
         "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
     )
 
 
-Coord = Tuple[int, int]
-TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
+TokenEater = Callable[[int, str, Coord, Coord, str], None]
 
 
-def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
+def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
     """
     The tokenize() function accepts two parameters: one representing the
     input stream, and one providing an output mechanism for tokenize().
@@ -213,18 +233,17 @@ def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken)
 
 
 # backwards compatible interface
-def tokenize_loop(readline, tokeneater):
+def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
     for token_info in generate_tokens(readline):
         tokeneater(*token_info)
 
 
-GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
+GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
 TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
 
 
 class Untokenizer:
-
-    tokens: List[Text]
+    tokens: List[str]
     prev_row: int
     prev_col: int
 
@@ -240,13 +259,13 @@ class Untokenizer:
         if col_offset:
             self.tokens.append(" " * col_offset)
 
-    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
+    def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
         for t in iterable:
             if len(t) == 2:
                 self.compat(cast(Tuple[int, str], t), iterable)
                 break
             tok_type, token, start, end, line = cast(
-                Tuple[int, Text, Coord, Coord, Text], t
+                Tuple[int, str, Coord, Coord, str], t
             )
             self.add_whitespace(start)
             self.tokens.append(token)
@@ -256,7 +275,7 @@ class Untokenizer:
                 self.prev_col = 0
         return "".join(self.tokens)
 
-    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
+    def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
         startline = False
         indents = []
         toks_append = self.tokens.append
@@ -286,7 +305,7 @@ class Untokenizer:
 
 
 cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
-blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
 
 
 def _get_normal_name(orig_enc: str) -> str:
@@ -328,7 +347,7 @@ def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
         try:
             return readline()
         except StopIteration:
-            return bytes()
+            return b""
 
     def find_cookie(line: bytes) -> Optional[str]:
         try:
@@ -377,7 +396,7 @@ def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
     return default, [first, second]
 
 
-def untokenize(iterable: Iterable[TokenInfo]) -> Text:
+def untokenize(iterable: Iterable[TokenInfo]) -> str:
     """Transform tokens back into Python source code.
 
     Each element returned by the iterable must be a token sequence
@@ -400,7 +419,7 @@ def untokenize(iterable: Iterable[TokenInfo]) -> Text:
 
 
 def generate_tokens(
-    readline: Callable[[], Text], grammar: Optional[Grammar] = None
+    readline: Callable[[], str], grammar: Optional[Grammar] = None
 ) -> Iterator[GoodTokenInfo]:
     """
     The generate_tokens() generator requires one argument, readline, which
@@ -418,7 +437,7 @@ def generate_tokens(
     logical line; continuation lines are included.
     """
     lnum = parenlev = continued = 0
-    numchars = "0123456789"
+    numchars: Final[str] = "0123456789"
     contstr, needcont = "", 0
     contline: Optional[str] = None
     indents = [0]
@@ -427,7 +446,7 @@ def generate_tokens(
     # `await` as keywords.
     async_keywords = False if grammar is None else grammar.async_keywords
     # 'stashed' and 'async_*' are used for async/await parsing
-    stashed = None
+    stashed: Optional[GoodTokenInfo] = None
     async_def = False
     async_def_indent = 0
     async_def_nl = False
@@ -440,7 +459,7 @@ def generate_tokens(
             line = readline()
         except StopIteration:
             line = ""
-        lnum = lnum + 1
+        lnum += 1
         pos, max = 0, len(line)
 
         if contstr:  # continued string
@@ -481,14 +500,14 @@ def generate_tokens(
             column = 0
             while pos < max:  # measure leading whitespace
                 if line[pos] == " ":
-                    column = column + 1
+                    column += 1
                 elif line[pos] == "\t":
                     column = (column // tabsize + 1) * tabsize
                 elif line[pos] == "\f":
                     column = 0
                 else:
                     break
-                pos = pos + 1
+                pos += 1
             if pos == max:
                 break
 
@@ -507,7 +526,7 @@ def generate_tokens(
                     COMMENT,
                     comment_token,
                     (lnum, pos),
-                    (lnum, pos + len(comment_token)),
+                    (lnum, nl_pos),
                     line,
                 )
                 yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
@@ -592,11 +611,15 @@ def generate_tokens(
                 ):
                     if token[-1] == "\n":  # continued string
                         strstart = (lnum, start)
-                        endprog = (
-                            endprogs[initial]
-                            or endprogs[token[1]]
-                            or endprogs[token[2]]
+                        maybe_endprog = (
+                            endprogs.get(initial)
+                            or endprogs.get(token[1])
+                            or endprogs.get(token[2])
                         )
+                        assert (
+                            maybe_endprog is not None
+                        ), f"endprog not found for {token}"
+                        endprog = maybe_endprog
                         contstr, needcont = line[start:], 1
                         contline = line
                         break
@@ -624,7 +647,6 @@ def generate_tokens(
 
                     if token in ("def", "for"):
                         if stashed and stashed[0] == NAME and stashed[1] == "async":
-
                             if token == "def":
                                 async_def = True
                                 async_def_indent = indents[-1]
@@ -652,29 +674,27 @@ def generate_tokens(
                     continued = 1
                 else:
                     if initial in "([{":
-                        parenlev = parenlev + 1
+                        parenlev += 1
                     elif initial in ")]}":
-                        parenlev = parenlev - 1
+                        parenlev -= 1
                     if stashed:
                         yield stashed
                         stashed = None
                     yield (OP, token, spos, epos, line)
             else:
                 yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
-                pos = pos + 1
+                pos += 1
 
     if stashed:
         yield stashed
         stashed = None
 
-    for indent in indents[1:]:  # pop remaining indent levels
+    for _indent in indents[1:]:  # pop remaining indent levels
         yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
     yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
 
 
 if __name__ == "__main__":  # testing
-    import sys
-
     if len(sys.argv) > 1:
         tokenize(open(sys.argv[1]).readline)
     else: