]> git.madduck.net Git - etc/vim.git/commitdiff

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Speed up new backtracking parser (#2728)
authorBatuhan Taskaya <isidentical@gmail.com>
Mon, 10 Jan 2022 18:22:00 +0000 (21:22 +0300)
committerGitHub <noreply@github.com>
Mon, 10 Jan 2022 18:22:00 +0000 (10:22 -0800)
CHANGES.md
src/blib2to3/pgen2/parse.py
tests/data/pattern_matching_generic.py [new file with mode: 0644]
tests/test_format.py

index f6e8343ed00c561b64a4ca8ab1eaebc25305aba1..a1c8ccb0b7d739b5449b2f21c0ffab5510c11221 100644 (file)
@@ -24,6 +24,8 @@
   at least one pre-existing blank line (#2736)
 - Verbose mode also now describes how a project root was discovered and which paths will
   be formatted. (#2526)
+- Speed-up the new backtracking parser about 4X in general (enabled when
+  `--target-version` is set to 3.10 and higher). (#2728)
 
 ### Packaging
 
index e5dad3ae76697124627e40c6601f0ef0bec97e5a..8fe9667289796bf16d2186b17a81a3b8cf1a6ece 100644 (file)
@@ -46,6 +46,17 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL:
     return Node(type=node[0], children=node[3], context=node[2])
 
 
+# A placeholder node, used when parser is backtracking.
+DUMMY_NODE = (-1, None, None, None)
+
+
+def stack_copy(
+    stack: List[Tuple[DFAS, int, RawNode]]
+) -> List[Tuple[DFAS, int, RawNode]]:
+    """Nodeless stack copy."""
+    return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack]
+
+
 class Recorder:
     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
         self.parser = parser
@@ -54,7 +65,7 @@ class Recorder:
 
         self._dead_ilabels: Set[int] = set()
         self._start_point = self.parser.stack
-        self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}
+        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
 
     @property
     def ilabels(self) -> Set[int]:
@@ -62,13 +73,32 @@ class Recorder:
 
     @contextmanager
     def switch_to(self, ilabel: int) -> Iterator[None]:
-        self.parser.stack = self._points[ilabel]
+        with self.backtrack():
+            self.parser.stack = self._points[ilabel]
+            try:
+                yield
+            except ParseError:
+                self._dead_ilabels.add(ilabel)
+            finally:
+                self.parser.stack = self._start_point
+
+    @contextmanager
+    def backtrack(self) -> Iterator[None]:
+        """
+        Use the node-level invariant ones for basic parsing operations (push/pop/shift).
+        These still will operate on the stack; but they won't create any new nodes, or
+        modify the contents of any other existing nodes.
+
+        This saves us a ton of time when we are backtracking, since we
+        want to restore to the initial state as quick as possible, which
+        can only be done by having as little mutatations as possible.
+        """
+        is_backtracking = self.parser.is_backtracking
         try:
+            self.parser.is_backtracking = True
             yield
-        except ParseError:
-            self._dead_ilabels.add(ilabel)
         finally:
-            self.parser.stack = self._start_point
+            self.parser.is_backtracking = is_backtracking
 
     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
         func: Callable[..., Any]
@@ -179,6 +209,7 @@ class Parser(object):
         self.grammar = grammar
         # See note in docstring above. TL;DR this is ignored.
         self.convert = convert or lam_sub
+        self.is_backtracking = False
 
     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
         """Prepare for parsing.
@@ -319,28 +350,40 @@ class Parser(object):
 
     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
         """Shift a token.  (Internal)"""
-        dfa, state, node = self.stack[-1]
-        rawnode: RawNode = (type, value, context, None)
-        newnode = convert(self.grammar, rawnode)
-        assert node[-1] is not None
-        node[-1].append(newnode)
-        self.stack[-1] = (dfa, newstate, node)
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+        else:
+            dfa, state, node = self.stack[-1]
+            rawnode: RawNode = (type, value, context, None)
+            newnode = convert(self.grammar, rawnode)
+            assert node[-1] is not None
+            node[-1].append(newnode)
+            self.stack[-1] = (dfa, newstate, node)
 
     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
         """Push a nonterminal.  (Internal)"""
-        dfa, state, node = self.stack[-1]
-        newnode: RawNode = (type, None, context, [])
-        self.stack[-1] = (dfa, newstate, node)
-        self.stack.append((newdfa, 0, newnode))
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+            self.stack.append((newdfa, 0, DUMMY_NODE))
+        else:
+            dfa, state, node = self.stack[-1]
+            newnode: RawNode = (type, None, context, [])
+            self.stack[-1] = (dfa, newstate, node)
+            self.stack.append((newdfa, 0, newnode))
 
     def pop(self) -> None:
         """Pop a nonterminal.  (Internal)"""
-        popdfa, popstate, popnode = self.stack.pop()
-        newnode = convert(self.grammar, popnode)
-        if self.stack:
-            dfa, state, node = self.stack[-1]
-            assert node[-1] is not None
-            node[-1].append(newnode)
+        if self.is_backtracking:
+            self.stack.pop()
         else:
-            self.rootnode = newnode
-            self.rootnode.used_names = self.used_names
+            popdfa, popstate, popnode = self.stack.pop()
+            newnode = convert(self.grammar, popnode)
+            if self.stack:
+                dfa, state, node = self.stack[-1]
+                assert node[-1] is not None
+                node[-1].append(newnode)
+            else:
+                self.rootnode = newnode
+                self.rootnode.used_names = self.used_names
diff --git a/tests/data/pattern_matching_generic.py b/tests/data/pattern_matching_generic.py
new file mode 100644 (file)
index 0000000..00a0e4a
--- /dev/null
@@ -0,0 +1,107 @@
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+
+def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
+    if not target_versions:
+        # No target_version specified, so try all grammars.
+        return [
+            # Python 3.7+
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
+            # Python 3.0-3.6
+            pygram.python_grammar_no_print_statement_no_exec_statement,
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    match match:
+        case case:
+            match match:
+                case case:
+                    pass
+
+    if all(version.is_python2() for version in target_versions):
+        # Python 2-only code, so try Python 2 grammars.
+        return [
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    re.match()
+    match = a
+    with match() as match:
+        match = f"{match}"
+
+    def test_patma_139(self):
+        x = False
+        match x:
+            case bool(z):
+                y = 0
+        self.assertIs(x, False)
+        self.assertEqual(y, 0)
+        self.assertIs(z, x)
+
+    # Python 3-compatible code, so only try Python 3 grammar.
+    grammars = []
+    if supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.10+
+        grammars.append(pygram.python_grammar_soft_keywords)
+    # If we have to parse both, try to parse async as a keyword first
+    if not supports_feature(
+        target_versions, Feature.ASYNC_IDENTIFIERS
+    ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.7-3.9
+        grammars.append(
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
+        )
+    if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
+        # Python 3.0-3.6
+        grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement)
+
+    def test_patma_155(self):
+        x = 0
+        y = None
+        match x:
+            case 1e1000:
+                y = 0
+        self.assertEqual(x, 0)
+        self.assertIs(y, None)
+
+        x = range(3)
+        match x:
+            case [y, case as x, z]:
+                w = 0
+
+    # At least one of the above branches must have been taken, because every Python
+    # version has exactly one of the two 'ASYNC_*' flags
+    return grammars
+
+
+def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
+    """Given a string with source, return the lib2to3 Node."""
+    if not src_txt.endswith("\n"):
+        src_txt += "\n"
+
+    grammars = get_grammars(set(target_versions))
+
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
index 6651272a87cf91ef3a3ea3f04283aa30e44f8616..db39678cdfe57e41c5f29641ca18e935b335dc74 100644 (file)
@@ -69,6 +69,7 @@ PY310_CASES = [
     "pattern_matching_complex",
     "pattern_matching_extras",
     "pattern_matching_style",
+    "pattern_matching_generic",
     "parenthesized_context_managers",
 ]