+++ /dev/null
-# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
-# Licensed to PSF under a Contributor Agreement.
-
-"""Parser engine for the grammar tables generated by pgen.
-
-The grammar table must be loaded first.
-
-See Parser/parser.c in the Python distribution for additional info on
-how this parsing engine works.
-
-"""
-from contextlib import contextmanager
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Dict,
- Iterator,
- List,
- Optional,
- Set,
- Tuple,
- Union,
- cast,
-)
-
-from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert
-
-# Local imports
-from . import grammar, token, tokenize
-
-if TYPE_CHECKING:
- from blib2to3.pgen2.driver import TokenProxy
-
-
-Results = Dict[str, NL]
-Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
-DFA = List[List[Tuple[int, int]]]
-DFAS = Tuple[DFA, Dict[int, int]]
-
-
-def lam_sub(grammar: Grammar, node: RawNode) -> NL:
- assert node[3] is not None
- return Node(type=node[0], children=node[3], context=node[2])
-
-
-# A placeholder node, used when parser is backtracking.
-DUMMY_NODE = (-1, None, None, None)
-
-
-def stack_copy(
- stack: List[Tuple[DFAS, int, RawNode]]
-) -> List[Tuple[DFAS, int, RawNode]]:
- """Nodeless stack copy."""
- return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
-
-
-class Recorder:
- def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
- self.parser = parser
- self._ilabels = ilabels
- self.context = context # not really matter
-
- self._dead_ilabels: Set[int] = set()
- self._start_point = self.parser.stack
- self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
-
- @property
- def ilabels(self) -> Set[int]:
- return self._dead_ilabels.symmetric_difference(self._ilabels)
-
- @contextmanager
- def switch_to(self, ilabel: int) -> Iterator[None]:
- with self.backtrack():
- self.parser.stack = self._points[ilabel]
- try:
- yield
- except ParseError:
- self._dead_ilabels.add(ilabel)
- finally:
- self.parser.stack = self._start_point
-
- @contextmanager
- def backtrack(self) -> Iterator[None]:
- """
- Use the node-level invariant ones for basic parsing operations (push/pop/shift).
- These still will operate on the stack; but they won't create any new nodes, or
- modify the contents of any other existing nodes.
-
- This saves us a ton of time when we are backtracking, since we
- want to restore to the initial state as quick as possible, which
- can only be done by having as little mutatations as possible.
- """
- is_backtracking = self.parser.is_backtracking
- try:
- self.parser.is_backtracking = True
- yield
- finally:
- self.parser.is_backtracking = is_backtracking
-
- def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
- func: Callable[..., Any]
- if raw:
- func = self.parser._addtoken
- else:
- func = self.parser.addtoken
-
- for ilabel in self.ilabels:
- with self.switch_to(ilabel):
- args = [tok_type, tok_val, self.context]
- if raw:
- args.insert(0, ilabel)
- func(*args)
-
- def determine_route(
- self, value: Optional[str] = None, force: bool = False
- ) -> Optional[int]:
- alive_ilabels = self.ilabels
- if len(alive_ilabels) == 0:
- *_, most_successful_ilabel = self._dead_ilabels
- raise ParseError("bad input", most_successful_ilabel, value, self.context)
-
- ilabel, *rest = alive_ilabels
- if force or not rest:
- return ilabel
- else:
- return None
-
-
-class ParseError(Exception):
- """Exception to signal the parser is stuck."""
-
- def __init__(
- self, msg: str, type: Optional[int], value: Optional[str], context: Context
- ) -> None:
- Exception.__init__(
- self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
- )
- self.msg = msg
- self.type = type
- self.value = value
- self.context = context
-
-
-class Parser:
- """Parser engine.
-
- The proper usage sequence is:
-
- p = Parser(grammar, [converter]) # create instance
- p.setup([start]) # prepare for parsing
- <for each input token>:
- if p.addtoken(...): # parse a token; may raise ParseError
- break
- root = p.rootnode # root of abstract syntax tree
-
- A Parser instance may be reused by calling setup() repeatedly.
-
- A Parser instance contains state pertaining to the current token
- sequence, and should not be used concurrently by different threads
- to parse separate token sequences.
-
- See driver.py for how to get input tokens by tokenizing a file or
- string.
-
- Parsing is complete when addtoken() returns True; the root of the
- abstract syntax tree can then be retrieved from the rootnode
- instance variable. When a syntax error occurs, addtoken() raises
- the ParseError exception. There is no error recovery; the parser
- cannot be used after a syntax error was reported (but it can be
- reinitialized by calling setup()).
-
- """
-
- def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
- """Constructor.
-
- The grammar argument is a grammar.Grammar instance; see the
- grammar module for more information.
-
- The parser is not ready yet for parsing; you must call the
- setup() method to get it started.
-
- The optional convert argument is a function mapping concrete
- syntax tree nodes to abstract syntax tree nodes. If not
- given, no conversion is done and the syntax tree produced is
- the concrete syntax tree. If given, it must be a function of
- two arguments, the first being the grammar (a grammar.Grammar
- instance), and the second being the concrete syntax tree node
- to be converted. The syntax tree is converted from the bottom
- up.
-
- **post-note: the convert argument is ignored since for Black's
- usage, convert will always be blib2to3.pytree.convert. Allowing
- this to be dynamic hurts mypyc's ability to use early binding.
- These docs are left for historical and informational value.
-
- A concrete syntax tree node is a (type, value, context, nodes)
- tuple, where type is the node type (a token or symbol number),
- value is None for symbols and a string for tokens, context is
- None or an opaque value used for error reporting (typically a
- (lineno, offset) pair), and nodes is a list of children for
- symbols, and None for tokens.
-
- An abstract syntax tree node may be anything; this is entirely
- up to the converter function.
-
- """
- self.grammar = grammar
- # See note in docstring above. TL;DR this is ignored.
- self.convert = convert or lam_sub
- self.is_backtracking = False
- self.last_token: Optional[int] = None
-
- def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
- """Prepare for parsing.
-
- This *must* be called before starting to parse.
-
- The optional argument is an alternative start symbol; it
- defaults to the grammar's start symbol.
-
- You can use a Parser instance to parse any number of programs;
- each time you call setup() the parser is reset to an initial
- state determined by the (implicit or explicit) start symbol.
-
- """
- if start is None:
- start = self.grammar.start
- # Each stack entry is a tuple: (dfa, state, node).
- # A node is a tuple: (type, value, context, children),
- # where children is a list of nodes or None, and context may be None.
- newnode: RawNode = (start, None, None, [])
- stackentry = (self.grammar.dfas[start], 0, newnode)
- self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
- self.rootnode: Optional[NL] = None
- self.used_names: Set[str] = set()
- self.proxy = proxy
- self.last_token = None
-
- def addtoken(self, type: int, value: str, context: Context) -> bool:
- """Add a token; return True iff this is the end of the program."""
- # Map from token to label
- ilabels = self.classify(type, value, context)
- assert len(ilabels) >= 1
-
- # If we have only one state to advance, we'll directly
- # take it as is.
- if len(ilabels) == 1:
- [ilabel] = ilabels
- return self._addtoken(ilabel, type, value, context)
-
- # If there are multiple states which we can advance (only
- # happen under soft-keywords), then we will try all of them
- # in parallel and as soon as one state can reach further than
- # the rest, we'll choose that one. This is a pretty hacky
- # and hopefully temporary algorithm.
- #
- # For a more detailed explanation, check out this post:
- # https://tree.science/what-the-backtracking.html
-
- with self.proxy.release() as proxy:
- counter, force = 0, False
- recorder = Recorder(self, ilabels, context)
- recorder.add_token(type, value, raw=True)
-
- next_token_value = value
- while recorder.determine_route(next_token_value) is None:
- if not proxy.can_advance(counter):
- force = True
- break
-
- next_token_type, next_token_value, *_ = proxy.eat(counter)
- if next_token_type in (tokenize.COMMENT, tokenize.NL):
- counter += 1
- continue
-
- if next_token_type == tokenize.OP:
- next_token_type = grammar.opmap[next_token_value]
-
- recorder.add_token(next_token_type, next_token_value)
- counter += 1
-
- ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
- assert ilabel is not None
-
- return self._addtoken(ilabel, type, value, context)
-
- def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
- # Loop until the token is shifted; may raise exceptions
- while True:
- dfa, state, node = self.stack[-1]
- states, first = dfa
- arcs = states[state]
- # Look for a state with this label
- for i, newstate in arcs:
- t = self.grammar.labels[i][0]
- if t >= 256:
- # See if it's a symbol and if we're in its first set
- itsdfa = self.grammar.dfas[t]
- itsstates, itsfirst = itsdfa
- if ilabel in itsfirst:
- # Push a symbol
- self.push(t, itsdfa, newstate, context)
- break # To continue the outer while loop
-
- elif ilabel == i:
- # Look it up in the list of labels
- # Shift a token; we're done with it
- self.shift(type, value, newstate, context)
- # Pop while we are in an accept-only state
- state = newstate
- while states[state] == [(0, state)]:
- self.pop()
- if not self.stack:
- # Done parsing!
- return True
- dfa, state, node = self.stack[-1]
- states, first = dfa
- # Done with this token
- self.last_token = type
- return False
-
- else:
- if (0, state) in arcs:
- # An accepting state, pop it and try something else
- self.pop()
- if not self.stack:
- # Done parsing, but another token is input
- raise ParseError("too much input", type, value, context)
- else:
- # No success finding a transition
- raise ParseError("bad input", type, value, context)
-
- def classify(self, type: int, value: str, context: Context) -> List[int]:
- """Turn a token into a label. (Internal)
-
- Depending on whether the value is a soft-keyword or not,
- this function may return multiple labels to choose from."""
- if type == token.NAME:
- # Keep a listing of all used names
- self.used_names.add(value)
- # Check for reserved words
- if value in self.grammar.keywords:
- return [self.grammar.keywords[value]]
- elif value in self.grammar.soft_keywords:
- assert type in self.grammar.tokens
- # Current soft keywords (match, case, type) can only appear at the
- # beginning of a statement. So as a shortcut, don't try to treat them
- # like keywords in any other context.
- # ('_' is also a soft keyword in the real grammar, but for our grammar
- # it's just an expression, so we don't need to treat it specially.)
- if self.last_token not in (
- None,
- token.INDENT,
- token.DEDENT,
- token.NEWLINE,
- token.SEMI,
- token.COLON,
- ):
- return [self.grammar.tokens[type]]
- return [
- self.grammar.tokens[type],
- self.grammar.soft_keywords[value],
- ]
-
- ilabel = self.grammar.tokens.get(type)
- if ilabel is None:
- raise ParseError("bad token", type, value, context)
- return [ilabel]
-
- def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
- """Shift a token. (Internal)"""
- if self.is_backtracking:
- dfa, state, _ = self.stack[-1]
- self.stack[-1] = (dfa, newstate, DUMMY_NODE)
- else:
- dfa, state, node = self.stack[-1]
- rawnode: RawNode = (type, value, context, None)
- newnode = convert(self.grammar, rawnode)
- assert node[-1] is not None
- node[-1].append(newnode)
- self.stack[-1] = (dfa, newstate, node)
-
- def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
- """Push a nonterminal. (Internal)"""
- if self.is_backtracking:
- dfa, state, _ = self.stack[-1]
- self.stack[-1] = (dfa, newstate, DUMMY_NODE)
- self.stack.append((newdfa, 0, DUMMY_NODE))
- else:
- dfa, state, node = self.stack[-1]
- newnode: RawNode = (type, None, context, [])
- self.stack[-1] = (dfa, newstate, node)
- self.stack.append((newdfa, 0, newnode))
-
- def pop(self) -> None:
- """Pop a nonterminal. (Internal)"""
- if self.is_backtracking:
- self.stack.pop()
- else:
- popdfa, popstate, popnode = self.stack.pop()
- newnode = convert(self.grammar, popnode)
- if self.stack:
- dfa, state, node = self.stack[-1]
- assert node[-1] is not None
- node[-1].append(newnode)
- else:
- self.rootnode = newnode
- self.rootnode.used_names = self.used_names