Move tokenizer config onto grammar, rename flag

[etc/vim.git] / blib2to3 / pgen2 / tokenize.py
diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py

index 14560e4fddff6224abbafd228229ed88605bd1a3..0912f43b867719f69e987bf1111725f81ed3693a 100644 (file)
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -29,9 +29,9 @@ __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  __credits__ = \
      'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  
-import string, re
+import re
  from codecs import BOM_UTF8, lookup
-from lib2to3.pgen2.token import *
+from blib2to3.pgen2.token import *
  
  from . import token
  __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
@@ -48,11 +48,15 @@ except NameError:
  def group(*choices): return '(' + '|'.join(choices) + ')'
  def any(*choices): return group(*choices) + '*'
  def maybe(*choices): return group(*choices) + '?'
+def _combinations(*l):
+    return set(
+        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
+    )
  
  Whitespace = r'[ \f\t]*'
  Comment = r'#[^\r\n]*'
  Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
+Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups
  
  Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
  Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -74,7 +78,7 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  # Tail end of """ string.
  Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
+_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
  Triple = group(_litprefix + "'''", _litprefix + '"""')
  # Single-line ' or " string.
  String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
@@ -103,61 +107,33 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
  
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
-    re.compile, (Token, PseudoToken, Single3, Double3)))
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
+
+_strprefixes = (
+    _combinations('r', 'R', 'f', 'F') |
+    _combinations('r', 'R', 'b', 'B') |
+    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
+)
+
  endprogs = {"'": re.compile(Single), '"': re.compile(Double),
              "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "f'''": single3prog, 'f"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "rb'''": single3prog, 'rb"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "F'''": single3prog, 'F"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            "rB'''": single3prog, 'rB"""': double3prog,
-            "Rb'''": single3prog, 'Rb"""': double3prog,
-            "RB'''": single3prog, 'RB"""': double3prog,
-            'r': None, 'R': None,
-            'u': None, 'U': None,
-            'f': None, 'F': None,
-            'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "f'''", 'f"""', "F'''", 'F"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
-          "rB'''", 'rB"""', "RB'''", 'RB"""',):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "u'", 'u"', "U'", 'U"',
-          "b'", 'b"', "B'", 'B"',
-          "f'", 'f"', "F'", 'F"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "rb'", 'rb"', "Rb'", 'Rb"',
-          "rB'", 'rB"', "RB'", 'RB"',):
-    single_quoted[t] = t
+            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+            **{prefix: None for prefix in _strprefixes}}
+
+triple_quoted = (
+    {"'''", '"""'} |
+    {f"{prefix}'''" for prefix in _strprefixes} |
+    {f'{prefix}"""' for prefix in _strprefixes}
+)
+single_quoted = (
+    {"'", '"'} |
+    {f"{prefix}'" for prefix in _strprefixes} |
+    {f'{prefix}"' for prefix in _strprefixes}
+)
  
  tabsize = 8
  
@@ -234,7 +210,7 @@ class Untokenizer:
          for tok in iterable:
              toknum, tokval = tok[:2]
  
-            if toknum in (NAME, NUMBER):
+            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                  tokval += ' '
  
              if toknum == INDENT:
@@ -358,7 +334,7 @@ def untokenize(iterable):
      ut = Untokenizer()
      return ut.untokenize(iterable)
  
-def generate_tokens(readline):
+def generate_tokens(readline, grammar=None):
      """
      The generate_tokens() generator requires one argument, readline, which
      must be a callable object which provides the same interface as the
@@ -375,11 +351,20 @@ def generate_tokens(readline):
      logical line; continuation lines are included.
      """
      lnum = parenlev = continued = 0
-    namechars, numchars = string.ascii_letters + '_', '0123456789'
+    numchars = '0123456789'
      contstr, needcont = '', 0
      contline = None
      indents = [0]
  
+    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+    # `await` as keywords.
+    async_keywords = False if grammar is None else grammar.async_keywords
+    # 'stashed' and 'async_*' are used for async/await parsing
+    stashed = None
+    async_def = False
+    async_def_indent = 0
+    async_def_nl = False
+
      while 1:                                   # loop over lines in stream
          try:
              line = readline()
@@ -420,31 +405,46 @@ def generate_tokens(readline):
                  pos = pos + 1
              if pos == max: break
  
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
-                           (lnum, nl_pos), (lnum, len(line)), line)
-                else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
+            if stashed:
+                yield stashed
+                stashed = None
+
+            if line[pos] in '\r\n':            # skip blank lines
+                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                  continue
  
-            if column > indents[-1]:           # count indents or dedents
+            if line[pos] == '#':               # skip comments
+                comment_token = line[pos:].rstrip('\r\n')
+                nl_pos = pos + len(comment_token)
+                yield (COMMENT, comment_token,
+                        (lnum, pos), (lnum, pos + len(comment_token)), line)
+                yield (NL, line[nl_pos:],
+                        (lnum, nl_pos), (lnum, len(line)), line)
+                continue
+
+            if column > indents[-1]:           # count indents
                  indents.append(column)
                  yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
+
+            while column < indents[-1]:        # count dedents
                  if column not in indents:
                      raise IndentationError(
                          "unindent does not match any outer indentation level",
                          ("<tokenize>", lnum, pos, line))
                  indents = indents[:-1]
  
+                if async_def and async_def_indent >= indents[-1]:
+                    async_def = False
+                    async_def_nl = False
+                    async_def_indent = 0
+
                  yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
  
+            if async_def and async_def_nl and async_def_indent >= indents[-1]:
+                async_def = False
+                async_def_nl = False
+                async_def_indent = 0
+
          else:                                  # continued statement
              if not line:
                  raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -464,10 +464,18 @@ def generate_tokens(readline):
                      newline = NEWLINE
                      if parenlev > 0:
                          newline = NL
+                    elif async_def:
+                        async_def_nl = True
+                    if stashed:
+                        yield stashed
+                        stashed = None
                      yield (newline, token, spos, epos, line)
  
                  elif initial == '#':
                      assert not token.endswith("\n")
+                    if stashed:
+                        yield stashed
+                        stashed = None
                      yield (COMMENT, token, spos, epos, line)
                  elif token in triple_quoted:
                      endprog = endprogs[token]
@@ -475,6 +483,9 @@ def generate_tokens(readline):
                      if endmatch:                           # all on one line
                          pos = endmatch.end(0)
                          token = line[start:pos]
+                        if stashed:
+                            yield stashed
+                            stashed = None
                          yield (STRING, token, spos, (lnum, pos), line)
                      else:
                          strstart = (lnum, start)           # multiple lines
@@ -492,22 +503,64 @@ def generate_tokens(readline):
                          contline = line
                          break
                      else:                                  # ordinary string
+                        if stashed:
+                            yield stashed
+                            stashed = None
                          yield (STRING, token, spos, epos, line)
-                elif initial in namechars:                 # ordinary name
-                    yield (NAME, token, spos, epos, line)
+                elif initial.isidentifier():               # ordinary name
+                    if token in ('async', 'await'):
+                        if async_keywords or async_def:
+                            yield (ASYNC if token == 'async' else AWAIT,
+                                   token, spos, epos, line)
+                            continue
+
+                    tok = (NAME, token, spos, epos, line)
+                    if token == 'async' and not stashed:
+                        stashed = tok
+                        continue
+
+                    if token in ('def', 'for'):
+                        if (stashed
+                                and stashed[0] == NAME
+                                and stashed[1] == 'async'):
+
+                            if token == 'def':
+                                async_def = True
+                                async_def_indent = indents[-1]
+
+                            yield (ASYNC, stashed[1],
+                                   stashed[2], stashed[3],
+                                   stashed[4])
+                            stashed = None
+
+                    if stashed:
+                        yield stashed
+                        stashed = None
+
+                    yield tok
                  elif initial == '\\':                      # continued stmt
                      # This yield is new; needed for better idempotency:
+                    if stashed:
+                        yield stashed
+                        stashed = None
                      yield (NL, token, spos, (lnum, pos), line)
                      continued = 1
                  else:
                      if initial in '([{': parenlev = parenlev + 1
                      elif initial in ')]}': parenlev = parenlev - 1
+                    if stashed:
+                        yield stashed
+                        stashed = None
                      yield (OP, token, spos, epos, line)
              else:
                  yield (ERRORTOKEN, line[pos],
                             (lnum, pos), (lnum, pos+1), line)
                  pos = pos + 1
  
+    if stashed:
+        yield stashed
+        stashed = None
+
      for indent in indents[1:]:                 # pop remaining indent levels
          yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
      yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')