]> git.madduck.net Git - etc/vim.git/blobdiff - src/blib2to3/pgen2/parse.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Improve performance by skipping unnecessary normalisation (#3751)
[etc/vim.git] / src / blib2to3 / pgen2 / parse.py
index 8c374d35b421c8a49ccd6b9b1309c6cf323111f9..17bf118e9fcd8053b655b4efc42e26f840b40465 100644 (file)
@@ -9,26 +9,31 @@ See Parser/parser.c in the Python distribution for additional info on
 how this parsing engine works.
 
 """
+from contextlib import contextmanager
 
 # Local imports
-from . import token
+from . import grammar, token, tokenize
 from typing import (
-    Optional,
-    Text,
-    Sequence,
+    cast,
     Any,
+    Optional,
     Union,
     Tuple,
     Dict,
     List,
+    Iterator,
     Callable,
     Set,
+    TYPE_CHECKING,
 )
 from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pytree import NL, Context, RawNode, Leaf, Node
+from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
+
+if TYPE_CHECKING:
+    from blib2to3.pgen2.driver import TokenProxy
 
 
-Results = Dict[Text, NL]
+Results = Dict[str, NL]
 Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
 DFA = List[List[Tuple[int, int]]]
 DFAS = Tuple[DFA, Dict[int, int]]
@@ -39,14 +44,95 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL:
     return Node(type=node[0], children=node[3], context=node[2])
 
 
+# A placeholder node, used when parser is backtracking.
+DUMMY_NODE = (-1, None, None, None)
+
+
+def stack_copy(
+    stack: List[Tuple[DFAS, int, RawNode]]
+) -> List[Tuple[DFAS, int, RawNode]]:
+    """Nodeless stack copy."""
+    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
+
+
+class Recorder:
+    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
+        self.parser = parser
+        self._ilabels = ilabels
+        self.context = context  # not really matter
+
+        self._dead_ilabels: Set[int] = set()
+        self._start_point = self.parser.stack
+        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
+
+    @property
+    def ilabels(self) -> Set[int]:
+        return self._dead_ilabels.symmetric_difference(self._ilabels)
+
+    @contextmanager
+    def switch_to(self, ilabel: int) -> Iterator[None]:
+        with self.backtrack():
+            self.parser.stack = self._points[ilabel]
+            try:
+                yield
+            except ParseError:
+                self._dead_ilabels.add(ilabel)
+            finally:
+                self.parser.stack = self._start_point
+
+    @contextmanager
+    def backtrack(self) -> Iterator[None]:
+        """
+        Use the node-level invariant ones for basic parsing operations (push/pop/shift).
+        These still will operate on the stack; but they won't create any new nodes, or
+        modify the contents of any other existing nodes.
+
+        This saves us a ton of time when we are backtracking, since we
+        want to restore to the initial state as quick as possible, which
+        can only be done by having as little mutatations as possible.
+        """
+        is_backtracking = self.parser.is_backtracking
+        try:
+            self.parser.is_backtracking = True
+            yield
+        finally:
+            self.parser.is_backtracking = is_backtracking
+
+    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
+        func: Callable[..., Any]
+        if raw:
+            func = self.parser._addtoken
+        else:
+            func = self.parser.addtoken
+
+        for ilabel in self.ilabels:
+            with self.switch_to(ilabel):
+                args = [tok_type, tok_val, self.context]
+                if raw:
+                    args.insert(0, ilabel)
+                func(*args)
+
+    def determine_route(self, value: Optional[str] = None, force: bool = False) -> Optional[int]:
+        alive_ilabels = self.ilabels
+        if len(alive_ilabels) == 0:
+            *_, most_successful_ilabel = self._dead_ilabels
+            raise ParseError("bad input", most_successful_ilabel, value, self.context)
+
+        ilabel, *rest = alive_ilabels
+        if force or not rest:
+            return ilabel
+        else:
+            return None
+
+
 class ParseError(Exception):
     """Exception to signal the parser is stuck."""
 
     def __init__(
-        self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
+        self, msg: str, type: Optional[int], value: Optional[str], context: Context
     ) -> None:
         Exception.__init__(
-            self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
+            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
         )
         self.msg = msg
         self.type = type
@@ -54,7 +140,7 @@ class ParseError(Exception):
         self.context = context
 
 
-class Parser(object):
+class Parser:
     """Parser engine.
 
     The proper usage sequence is:
@@ -102,6 +188,11 @@ class Parser(object):
         to be converted.  The syntax tree is converted from the bottom
         up.
 
+        **post-note: the convert argument is ignored since for Black's
+        usage, convert will always be blib2to3.pytree.convert. Allowing
+        this to be dynamic hurts mypyc's ability to use early binding.
+        These docs are left for historical and informational value.
+
         A concrete syntax tree node is a (type, value, context, nodes)
         tuple, where type is the node type (a token or symbol number),
         value is None for symbols and a string for tokens, context is
@@ -114,9 +205,11 @@ class Parser(object):
 
         """
         self.grammar = grammar
+        # See note in docstring above. TL;DR this is ignored.
         self.convert = convert or lam_sub
+        self.is_backtracking = False
 
-    def setup(self, start: Optional[int] = None) -> None:
+    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
         """Prepare for parsing.
 
         This *must* be called before starting to parse.
@@ -139,11 +232,57 @@ class Parser(object):
         self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
         self.rootnode: Optional[NL] = None
         self.used_names: Set[str] = set()
+        self.proxy = proxy
 
-    def addtoken(self, type: int, value: Optional[Text], context: Context) -> bool:
+    def addtoken(self, type: int, value: str, context: Context) -> bool:
         """Add a token; return True iff this is the end of the program."""
         # Map from token to label
-        ilabel = self.classify(type, value, context)
+        ilabels = self.classify(type, value, context)
+        assert len(ilabels) >= 1
+
+        # If we have only one state to advance, we'll directly
+        # take it as is.
+        if len(ilabels) == 1:
+            [ilabel] = ilabels
+            return self._addtoken(ilabel, type, value, context)
+
+        # If there are multiple states which we can advance (only
+        # happen under soft-keywords), then we will try all of them
+        # in parallel and as soon as one state can reach further than
+        # the rest, we'll choose that one. This is a pretty hacky
+        # and hopefully temporary algorithm.
+        #
+        # For a more detailed explanation, check out this post:
+        # https://tree.science/what-the-backtracking.html
+
+        with self.proxy.release() as proxy:
+            counter, force = 0, False
+            recorder = Recorder(self, ilabels, context)
+            recorder.add_token(type, value, raw=True)
+
+            next_token_value = value
+            while recorder.determine_route(next_token_value) is None:
+                if not proxy.can_advance(counter):
+                    force = True
+                    break
+
+                next_token_type, next_token_value, *_ = proxy.eat(counter)
+                if next_token_type in (tokenize.COMMENT, tokenize.NL):
+                    counter += 1
+                    continue
+
+                if next_token_type == tokenize.OP:
+                    next_token_type = grammar.opmap[next_token_value]
+
+                recorder.add_token(next_token_type, next_token_value)
+                counter += 1
+
+            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
+            assert ilabel is not None
+
+        return self._addtoken(ilabel, type, value, context)
+
+    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
         # Loop until the token is shifted; may raise exceptions
         while True:
             dfa, state, node = self.stack[-1]
@@ -151,10 +290,18 @@ class Parser(object):
             arcs = states[state]
             # Look for a state with this label
             for i, newstate in arcs:
-                t, v = self.grammar.labels[i]
-                if ilabel == i:
+                t = self.grammar.labels[i][0]
+                if t >= 256:
+                    # See if it's a symbol and if we're in its first set
+                    itsdfa = self.grammar.dfas[t]
+                    itsstates, itsfirst = itsdfa
+                    if ilabel in itsfirst:
+                        # Push a symbol
+                        self.push(t, itsdfa, newstate, context)
+                        break  # To continue the outer while loop
+
+                elif ilabel == i:
                     # Look it up in the list of labels
-                    assert t < 256
                     # Shift a token; we're done with it
                     self.shift(type, value, newstate, context)
                     # Pop while we are in an accept-only state
@@ -168,14 +315,7 @@ class Parser(object):
                         states, first = dfa
                     # Done with this token
                     return False
-                elif t >= 256:
-                    # See if it's a symbol and if we're in its first set
-                    itsdfa = self.grammar.dfas[t]
-                    itsstates, itsfirst = itsdfa
-                    if ilabel in itsfirst:
-                        # Push a symbol
-                        self.push(t, self.grammar.dfas[t], newstate, context)
-                        break  # To continue the outer while loop
+
             else:
                 if (0, state) in arcs:
                     # An accepting state, pop it and try something else
@@ -187,47 +327,61 @@ class Parser(object):
                     # No success finding a transition
                     raise ParseError("bad input", type, value, context)
 
-    def classify(self, type: int, value: Optional[Text], context: Context) -> int:
-        """Turn a token into a label.  (Internal)"""
+    def classify(self, type: int, value: str, context: Context) -> List[int]:
+        """Turn a token into a label.  (Internal)
+
+        Depending on whether the value is a soft-keyword or not,
+        this function may return multiple labels to choose from."""
         if type == token.NAME:
             # Keep a listing of all used names
-            assert value is not None
             self.used_names.add(value)
             # Check for reserved words
-            ilabel = self.grammar.keywords.get(value)
-            if ilabel is not None:
-                return ilabel
+            if value in self.grammar.keywords:
+                return [self.grammar.keywords[value]]
+            elif value in self.grammar.soft_keywords:
+                assert type in self.grammar.tokens
+                return [
+                    self.grammar.soft_keywords[value],
+                    self.grammar.tokens[type],
+                ]
+
         ilabel = self.grammar.tokens.get(type)
         if ilabel is None:
             raise ParseError("bad token", type, value, context)
-        return ilabel
+        return [ilabel]
 
-    def shift(
-        self, type: int, value: Optional[Text], newstate: int, context: Context
-    ) -> None:
+    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
         """Shift a token.  (Internal)"""
-        dfa, state, node = self.stack[-1]
-        assert value is not None
-        assert context is not None
-        rawnode: RawNode = (type, value, context, None)
-        newnode = self.convert(self.grammar, rawnode)
-        if newnode is not None:
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+        else:
+            dfa, state, node = self.stack[-1]
+            rawnode: RawNode = (type, value, context, None)
+            newnode = convert(self.grammar, rawnode)
             assert node[-1] is not None
             node[-1].append(newnode)
-        self.stack[-1] = (dfa, newstate, node)
+            self.stack[-1] = (dfa, newstate, node)
 
     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
         """Push a nonterminal.  (Internal)"""
-        dfa, state, node = self.stack[-1]
-        newnode: RawNode = (type, None, context, [])
-        self.stack[-1] = (dfa, newstate, node)
-        self.stack.append((newdfa, 0, newnode))
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+            self.stack.append((newdfa, 0, DUMMY_NODE))
+        else:
+            dfa, state, node = self.stack[-1]
+            newnode: RawNode = (type, None, context, [])
+            self.stack[-1] = (dfa, newstate, node)
+            self.stack.append((newdfa, 0, newnode))
 
     def pop(self) -> None:
         """Pop a nonterminal.  (Internal)"""
-        popdfa, popstate, popnode = self.stack.pop()
-        newnode = self.convert(self.grammar, popnode)
-        if newnode is not None:
+        if self.is_backtracking:
+            self.stack.pop()
+        else:
+            popdfa, popstate, popnode = self.stack.pop()
+            newnode = convert(self.grammar, popnode)
             if self.stack:
                 dfa, state, node = self.stack[-1]
                 assert node[-1] is not None