# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union, cast

# Local imports
from . import grammar, token, tokenize
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = Dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]],
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
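
# Illustrative note (hypothetical values): each stack entry is a
# (dfa, state, node) tuple, and stack_copy() replaces every node with
# DUMMY_NODE, so a snapshot such as
#   [(dfas_for_start_symbol, 0, (256, None, None, [...]))]
# becomes
#   [(dfas_for_start_symbol, 0, DUMMY_NODE)]
# The Recorder below keeps one such nodeless copy per candidate ilabel and
# swaps them in while exploring alternatives, so no real tree nodes are
# created or mutated during speculative parsing.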


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant ones for basic parsing operations (push/pop/shift).
        These will still operate on the stack, but they won't create any new nodes, or
        modify the contents of any other existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore to the initial state as quickly as possible, which
        can only be done by having as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking
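
    # Rough sketch of the interplay: add_token() below enters switch_to(ilabel),
    # which in turn enters backtrack(). While is_backtracking is set,
    # Parser.shift/push/pop only rewrite (dfa, state) pairs and store DUMMY_NODE,
    # so abandoning a speculative branch is just a matter of restoring
    # self.parser.stack to the saved start point.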

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)

                func(*args)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None
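
    # Rough sketch of how Parser.addtoken() drives a Recorder when a token is
    # ambiguous (for example, a soft keyword):
    #
    #   recorder = Recorder(parser, ilabels, context)
    #   recorder.add_token(tok_type, tok_val, raw=True)
    #   while recorder.determine_route(next_value) is None:
    #       ...feed lookahead tokens via recorder.add_token(...)...
    #   ilabel = recorder.determine_route(next_value, force=...)
    #
    # determine_route() returns an ilabel once only one candidate survives (or
    # when forced), and raises ParseError once every candidate has died.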


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup(proxy, [start])           # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """
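
    # A minimal usage sketch (hypothetical grammar/proxy objects; in Black the
    # pgen2 driver normally does this wiring):
    #
    #   p = Parser(g)                      # g: a grammar.Grammar instance
    #   p.setup(proxy)                     # proxy: a TokenProxy over the token stream
    #   for type, value, context in tokens:
    #       if p.addtoken(type, value, context):
    #           break                      # start symbol fully parsed
    #   tree = p.rootnode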

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this can only
        # happen under soft-keywords), then we will try all of them
        # in parallel and as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)
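
    # For instance (illustrative): when the tokenizer yields the NAME "match",
    # classify() returns two ilabels, one for the soft keyword and one for a
    # plain NAME. addtoken() then replays the following tokens against both
    # candidate stacks via a Recorder until only one interpretation can still
    # advance, and commits to that one.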

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]
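
    # Illustrative examples (label numbers are grammar-dependent):
    #   classify(token.NAME, "if", ctx)    -> [keyword label for "if"]
    #   classify(token.NAME, "match", ctx) -> [soft-keyword label, generic NAME label]
    #   classify(token.NUMBER, "42", ctx)  -> [label for NUMBER]
    # Only the soft-keyword case produces more than one candidate, which is
    # what triggers the Recorder-based lookahead in addtoken().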

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
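
# In Black this module is not normally used directly; blib2to3.pgen2.driver
# tokenizes the source, wraps the token stream in a TokenProxy, and feeds the
# tokens to Parser.addtoken() as sketched above. Roughly (names and signatures
# simplified):
#
#   from blib2to3.pgen2 import driver
#   d = driver.Driver(grammar_instance)
#   tree = d.parse_string(source_code)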