X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/e36b8c71bb78f70735b8b4f239b0f574a6e0f277..f8617f975d56e81cfb4070ce65584f7b29a77e7a:/blib2to3/pgen2/tokenize.py

diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py
index 6b8a5cb..43e1d59 100644
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -29,8 +29,9 @@ __author__ = 'Ka-Ping Yee <ping@lfw.org>'
 __credits__ = \
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
 
-import string, re, unicodedata
+import re
 from codecs import BOM_UTF8, lookup
+from attr import dataclass
 from blib2to3.pgen2.token import *
 
 from . import token
@@ -48,11 +49,15 @@ except NameError:
 def group(*choices): return '(' + '|'.join(choices) + ')'
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
+def _combinations(*l):
+    return set(
+        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
+    )
 
 Whitespace = r'[ \f\t]*'
 Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[^\d\W]\w*'
+Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups
 
 Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
 Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -74,7 +79,7 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
+_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
 Triple = group(_litprefix + "'''", _litprefix + '"""')
 # Single-line ' or " string.
 String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
@@ -107,62 +112,36 @@ tokenprog = re.compile(Token, re.UNICODE)
 pseudoprog = re.compile(PseudoToken, re.UNICODE)
 single3prog = re.compile(Single3)
 double3prog = re.compile(Double3)
+
+_strprefixes = (
+    _combinations('r', 'R', 'f', 'F') |
+    _combinations('r', 'R', 'b', 'B') |
+    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
+)
+
 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
             "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "f'''": single3prog, 'f"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "rb'''": single3prog, 'rb"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "F'''": single3prog, 'F"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            "rB'''": single3prog, 'rB"""': double3prog,
-            "Rb'''": single3prog, 'Rb"""': double3prog,
-            "RB'''": single3prog, 'RB"""': double3prog,
-            'r': None, 'R': None,
-            'u': None, 'U': None,
-            'f': None, 'F': None,
-            'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "f'''", 'f"""', "F'''", 'F"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
-          "rB'''", 'rB"""', "RB'''", 'RB"""',):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "u'", 'u"', "U'", 'U"',
-          "b'", 'b"', "B'", 'B"',
-          "f'", 'f"', "F'", 'F"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "rb'", 'rb"', "Rb'", 'Rb"',
-          "rB'", 'rB"', "RB'", 'RB"',):
-    single_quoted[t] = t
+            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+            **{prefix: None for prefix in _strprefixes}}
+
+triple_quoted = (
+    {"'''", '"""'} |
+    {f"{prefix}'''" for prefix in _strprefixes} |
+    {f'{prefix}"""' for prefix in _strprefixes}
+)
+single_quoted = (
+    {"'", '"'} |
+    {f"{prefix}'" for prefix in _strprefixes} |
+    {f'{prefix}"' for prefix in _strprefixes}
+)
 
 tabsize = 8
 
+@dataclass(frozen=True)
+class TokenizerConfig:
+    async_is_reserved_keyword: bool = False
+
 class TokenError(Exception): pass
 
 class StopTokenizing(Exception): pass
@@ -360,9 +339,7 @@ def untokenize(iterable):
     ut = Untokenizer()
     return ut.untokenize(iterable)
 
-InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
-
-def generate_tokens(readline):
+def generate_tokens(readline, config: TokenizerConfig = TokenizerConfig()):
     """
     The generate_tokens() generator requires one argument, readline, which
     must be a callable object which provides the same interface as the
@@ -379,11 +356,14 @@ def generate_tokens(readline):
     logical line; continuation lines are included.
     """
     lnum = parenlev = continued = 0
-    namechars, numchars = string.ascii_letters + '_', '0123456789'
+    numchars = '0123456789'
     contstr, needcont = '', 0
     contline = None
     indents = [0]
 
+    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+    # `await` as keywords.
+    async_is_reserved_keyword = config.async_is_reserved_keyword
     # 'stashed' and 'async_*' are used for async/await parsing
     stashed = None
     async_def = False
@@ -438,10 +418,6 @@ def generate_tokens(readline):
                 yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                 continue
 
-            if column > indents[-1]:           # count indents
-                indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-
             if line[pos] == '#':               # skip comments
                 comment_token = line[pos:].rstrip('\r\n')
                 nl_pos = pos + len(comment_token)
@@ -451,6 +427,10 @@ def generate_tokens(readline):
                         (lnum, nl_pos), (lnum, len(line)), line)
                 continue
 
+            if column > indents[-1]:           # count indents
+                indents.append(column)
+                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+
             while column < indents[-1]:        # count dedents
                 if column not in indents:
                     raise IndentationError(
@@ -477,8 +457,6 @@ def generate_tokens(readline):
 
         while pos < max:
             pseudomatch = pseudoprog.match(line, pos)
-            if not pseudomatch:
-                print('no pseudomatch')
             if pseudomatch:                                # scan for tokens
                 start, end = pseudomatch.span(1)
                 spos, epos, pos = (lnum, start), (lnum, end), end
@@ -534,10 +512,9 @@ def generate_tokens(readline):
                             yield stashed
                             stashed = None
                         yield (STRING, token, spos, epos, line)
-                elif (initial in namechars or              # ordinary name
-                      unicodedata.category(initial) in InitialCategories):
+                elif initial.isidentifier():               # ordinary name
                     if token in ('async', 'await'):
-                        if async_def:
+                        if async_is_reserved_keyword or async_def:
                             yield (ASYNC if token == 'async' else AWAIT,
                                    token, spos, epos, line)
                             continue
@@ -547,13 +524,14 @@ def generate_tokens(readline):
                         stashed = tok
                         continue
 
-                    if token == 'def':
+                    if token in ('def', 'for'):
                         if (stashed
                                 and stashed[0] == NAME
                                 and stashed[1] == 'async'):
 
-                            async_def = True
-                            async_def_indent = indents[-1]
+                            if token == 'def':
+                                async_def = True
+                                async_def_indent = indents[-1]
 
                             yield (ASYNC, stashed[1],
                                    stashed[2], stashed[3],