src/blib2to3/pgen2/parse.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """Parser engine for the grammar tables generated by pgen.
   5
   6 The grammar table must be loaded first.
   7
   8 See Parser/parser.c in the Python distribution for additional info on
   9 how this parsing engine works.
  10
  11 """
  12 import copy
  13 from contextlib import contextmanager
  14
  15 # Local imports
  16 from . import grammar, token, tokenize
  17 from typing import (
  18     cast,
  19     Any,
  20     Optional,
  21     Text,
  22     Union,
  23     Tuple,
  24     Dict,
  25     List,
  26     Iterator,
  27     Callable,
  28     Set,
  29     TYPE_CHECKING,
  30 )
  31 from blib2to3.pgen2.grammar import Grammar
  32 from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
  33
  34 if TYPE_CHECKING:
  35     from blib2to3.driver import TokenProxy
  36
  37
  38 Results = Dict[Text, NL]
  39 Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
  40 DFA = List[List[Tuple[int, int]]]
  41 DFAS = Tuple[DFA, Dict[int, int]]
  42
  43
  44 def lam_sub(grammar: Grammar, node: RawNode) -> NL:
  45     assert node[3] is not None
  46     return Node(type=node[0], children=node[3], context=node[2])
  47
  48
  49 # A placeholder node, used when parser is backtracking.
  50 DUMMY_NODE = (-1, None, None, None)
  51
  52
  53 def stack_copy(
  54     stack: List[Tuple[DFAS, int, RawNode]]
  55 ) -> List[Tuple[DFAS, int, RawNode]]:
  56     """Nodeless stack copy."""
  57     return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack]
  58
  59
  60 class Recorder:
  61     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
  62         self.parser = parser
  63         self._ilabels = ilabels
  64         self.context = context  # not really matter
  65
  66         self._dead_ilabels: Set[int] = set()
  67         self._start_point = self.parser.stack
  68         self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
  69
  70     @property
  71     def ilabels(self) -> Set[int]:
  72         return self._dead_ilabels.symmetric_difference(self._ilabels)
  73
  74     @contextmanager
  75     def switch_to(self, ilabel: int) -> Iterator[None]:
  76         with self.backtrack():
  77             self.parser.stack = self._points[ilabel]
  78             try:
  79                 yield
  80             except ParseError:
  81                 self._dead_ilabels.add(ilabel)
  82             finally:
  83                 self.parser.stack = self._start_point
  84
  85     @contextmanager
  86     def backtrack(self) -> Iterator[None]:
  87         """
  88         Use the node-level invariant ones for basic parsing operations (push/pop/shift).
  89         These still will operate on the stack; but they won't create any new nodes, or
  90         modify the contents of any other existing nodes.
  91
  92         This saves us a ton of time when we are backtracking, since we
  93         want to restore to the initial state as quick as possible, which
  94         can only be done by having as little mutatations as possible.
  95         """
  96         is_backtracking = self.parser.is_backtracking
  97         try:
  98             self.parser.is_backtracking = True
  99             yield
 100         finally:
 101             self.parser.is_backtracking = is_backtracking
 102
 103     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
 104         func: Callable[..., Any]
 105         if raw:
 106             func = self.parser._addtoken
 107         else:
 108             func = self.parser.addtoken
 109
 110         for ilabel in self.ilabels:
 111             with self.switch_to(ilabel):
 112                 args = [tok_type, tok_val, self.context]
 113                 if raw:
 114                     args.insert(0, ilabel)
 115                 func(*args)
 116
 117     def determine_route(self, value: Text = None, force: bool = False) -> Optional[int]:
 118         alive_ilabels = self.ilabels
 119         if len(alive_ilabels) == 0:
 120             *_, most_successful_ilabel = self._dead_ilabels
 121             raise ParseError("bad input", most_successful_ilabel, value, self.context)
 122
 123         ilabel, *rest = alive_ilabels
 124         if force or not rest:
 125             return ilabel
 126         else:
 127             return None
 128
 129
 130 class ParseError(Exception):
 131     """Exception to signal the parser is stuck."""
 132
 133     def __init__(
 134         self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
 135     ) -> None:
 136         Exception.__init__(
 137             self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
 138         )
 139         self.msg = msg
 140         self.type = type
 141         self.value = value
 142         self.context = context
 143
 144
 145 class Parser(object):
 146     """Parser engine.
 147
 148     The proper usage sequence is:
 149
 150     p = Parser(grammar, [converter])  # create instance
 151     p.setup([start])                  # prepare for parsing
 152     <for each input token>:
 153         if p.addtoken(...):           # parse a token; may raise ParseError
 154             break
 155     root = p.rootnode                 # root of abstract syntax tree
 156
 157     A Parser instance may be reused by calling setup() repeatedly.
 158
 159     A Parser instance contains state pertaining to the current token
 160     sequence, and should not be used concurrently by different threads
 161     to parse separate token sequences.
 162
 163     See driver.py for how to get input tokens by tokenizing a file or
 164     string.
 165
 166     Parsing is complete when addtoken() returns True; the root of the
 167     abstract syntax tree can then be retrieved from the rootnode
 168     instance variable.  When a syntax error occurs, addtoken() raises
 169     the ParseError exception.  There is no error recovery; the parser
 170     cannot be used after a syntax error was reported (but it can be
 171     reinitialized by calling setup()).
 172
 173     """
 174
 175     def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
 176         """Constructor.
 177
 178         The grammar argument is a grammar.Grammar instance; see the
 179         grammar module for more information.
 180
 181         The parser is not ready yet for parsing; you must call the
 182         setup() method to get it started.
 183
 184         The optional convert argument is a function mapping concrete
 185         syntax tree nodes to abstract syntax tree nodes.  If not
 186         given, no conversion is done and the syntax tree produced is
 187         the concrete syntax tree.  If given, it must be a function of
 188         two arguments, the first being the grammar (a grammar.Grammar
 189         instance), and the second being the concrete syntax tree node
 190         to be converted.  The syntax tree is converted from the bottom
 191         up.
 192
 193         **post-note: the convert argument is ignored since for Black's
 194         usage, convert will always be blib2to3.pytree.convert. Allowing
 195         this to be dynamic hurts mypyc's ability to use early binding.
 196         These docs are left for historical and informational value.
 197
 198         A concrete syntax tree node is a (type, value, context, nodes)
 199         tuple, where type is the node type (a token or symbol number),
 200         value is None for symbols and a string for tokens, context is
 201         None or an opaque value used for error reporting (typically a
 202         (lineno, offset) pair), and nodes is a list of children for
 203         symbols, and None for tokens.
 204
 205         An abstract syntax tree node may be anything; this is entirely
 206         up to the converter function.
 207
 208         """
 209         self.grammar = grammar
 210         # See note in docstring above. TL;DR this is ignored.
 211         self.convert = convert or lam_sub
 212         self.is_backtracking = False
 213
 214     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
 215         """Prepare for parsing.
 216
 217         This *must* be called before starting to parse.
 218
 219         The optional argument is an alternative start symbol; it
 220         defaults to the grammar's start symbol.
 221
 222         You can use a Parser instance to parse any number of programs;
 223         each time you call setup() the parser is reset to an initial
 224         state determined by the (implicit or explicit) start symbol.
 225
 226         """
 227         if start is None:
 228             start = self.grammar.start
 229         # Each stack entry is a tuple: (dfa, state, node).
 230         # A node is a tuple: (type, value, context, children),
 231         # where children is a list of nodes or None, and context may be None.
 232         newnode: RawNode = (start, None, None, [])
 233         stackentry = (self.grammar.dfas[start], 0, newnode)
 234         self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
 235         self.rootnode: Optional[NL] = None
 236         self.used_names: Set[str] = set()
 237         self.proxy = proxy
 238
 239     def addtoken(self, type: int, value: Text, context: Context) -> bool:
 240         """Add a token; return True iff this is the end of the program."""
 241         # Map from token to label
 242         ilabels = self.classify(type, value, context)
 243         assert len(ilabels) >= 1
 244
 245         # If we have only one state to advance, we'll directly
 246         # take it as is.
 247         if len(ilabels) == 1:
 248             [ilabel] = ilabels
 249             return self._addtoken(ilabel, type, value, context)
 250
 251         # If there are multiple states which we can advance (only
 252         # happen under soft-keywords), then we will try all of them
 253         # in parallel and as soon as one state can reach further than
 254         # the rest, we'll choose that one. This is a pretty hacky
 255         # and hopefully temporary algorithm.
 256         #
 257         # For a more detailed explanation, check out this post:
 258         # https://tree.science/what-the-backtracking.html
 259
 260         with self.proxy.release() as proxy:
 261             counter, force = 0, False
 262             recorder = Recorder(self, ilabels, context)
 263             recorder.add_token(type, value, raw=True)
 264
 265             next_token_value = value
 266             while recorder.determine_route(next_token_value) is None:
 267                 if not proxy.can_advance(counter):
 268                     force = True
 269                     break
 270
 271                 next_token_type, next_token_value, *_ = proxy.eat(counter)
 272                 if next_token_type == tokenize.OP:
 273                     next_token_type = grammar.opmap[next_token_value]
 274
 275                 recorder.add_token(next_token_type, next_token_value)
 276                 counter += 1
 277
 278             ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
 279             assert ilabel is not None
 280
 281         return self._addtoken(ilabel, type, value, context)
 282
 283     def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
 284         # Loop until the token is shifted; may raise exceptions
 285         while True:
 286             dfa, state, node = self.stack[-1]
 287             states, first = dfa
 288             arcs = states[state]
 289             # Look for a state with this label
 290             for i, newstate in arcs:
 291                 t = self.grammar.labels[i][0]
 292                 if t >= 256:
 293                     # See if it's a symbol and if we're in its first set
 294                     itsdfa = self.grammar.dfas[t]
 295                     itsstates, itsfirst = itsdfa
 296                     if ilabel in itsfirst:
 297                         # Push a symbol
 298                         self.push(t, itsdfa, newstate, context)
 299                         break  # To continue the outer while loop
 300
 301                 elif ilabel == i:
 302                     # Look it up in the list of labels
 303                     # Shift a token; we're done with it
 304                     self.shift(type, value, newstate, context)
 305                     # Pop while we are in an accept-only state
 306                     state = newstate
 307                     while states[state] == [(0, state)]:
 308                         self.pop()
 309                         if not self.stack:
 310                             # Done parsing!
 311                             return True
 312                         dfa, state, node = self.stack[-1]
 313                         states, first = dfa
 314                     # Done with this token
 315                     return False
 316
 317             else:
 318                 if (0, state) in arcs:
 319                     # An accepting state, pop it and try something else
 320                     self.pop()
 321                     if not self.stack:
 322                         # Done parsing, but another token is input
 323                         raise ParseError("too much input", type, value, context)
 324                 else:
 325                     # No success finding a transition
 326                     raise ParseError("bad input", type, value, context)
 327
 328     def classify(self, type: int, value: Text, context: Context) -> List[int]:
 329         """Turn a token into a label.  (Internal)
 330
 331         Depending on whether the value is a soft-keyword or not,
 332         this function may return multiple labels to choose from."""
 333         if type == token.NAME:
 334             # Keep a listing of all used names
 335             self.used_names.add(value)
 336             # Check for reserved words
 337             if value in self.grammar.keywords:
 338                 return [self.grammar.keywords[value]]
 339             elif value in self.grammar.soft_keywords:
 340                 assert type in self.grammar.tokens
 341                 return [
 342                     self.grammar.soft_keywords[value],
 343                     self.grammar.tokens[type],
 344                 ]
 345
 346         ilabel = self.grammar.tokens.get(type)
 347         if ilabel is None:
 348             raise ParseError("bad token", type, value, context)
 349         return [ilabel]
 350
 351     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
 352         """Shift a token.  (Internal)"""
 353         if self.is_backtracking:
 354             dfa, state, _ = self.stack[-1]
 355             self.stack[-1] = (dfa, newstate, DUMMY_NODE)
 356         else:
 357             dfa, state, node = self.stack[-1]
 358             rawnode: RawNode = (type, value, context, None)
 359             newnode = convert(self.grammar, rawnode)
 360             assert node[-1] is not None
 361             node[-1].append(newnode)
 362             self.stack[-1] = (dfa, newstate, node)
 363
 364     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
 365         """Push a nonterminal.  (Internal)"""
 366         if self.is_backtracking:
 367             dfa, state, _ = self.stack[-1]
 368             self.stack[-1] = (dfa, newstate, DUMMY_NODE)
 369             self.stack.append((newdfa, 0, DUMMY_NODE))
 370         else:
 371             dfa, state, node = self.stack[-1]
 372             newnode: RawNode = (type, None, context, [])
 373             self.stack[-1] = (dfa, newstate, node)
 374             self.stack.append((newdfa, 0, newnode))
 375
 376     def pop(self) -> None:
 377         """Pop a nonterminal.  (Internal)"""
 378         if self.is_backtracking:
 379             self.stack.pop()
 380         else:
 381             popdfa, popstate, popnode = self.stack.pop()
 382             newnode = convert(self.grammar, popnode)
 383             if self.stack:
 384                 dfa, state, node = self.stack[-1]
 385                 assert node[-1] is not None
 386                 node[-1].append(newnode)
 387             else:
 388                 self.rootnode = newnode
 389                 self.rootnode.used_names = self.used_names