src/blib2to3/pgen2/parse.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """Parser engine for the grammar tables generated by pgen.
   5
   6 The grammar table must be loaded first.
   7
   8 See Parser/parser.c in the Python distribution for additional info on
   9 how this parsing engine works.
  10
  11 """
  12 import copy
  13 from contextlib import contextmanager
  14
  15 # Local imports
  16 from . import grammar, token, tokenize
  17 from typing import (
  18     cast,
  19     Any,
  20     Optional,
  21     Text,
  22     Union,
  23     Tuple,
  24     Dict,
  25     List,
  26     Iterator,
  27     Callable,
  28     Set,
  29     TYPE_CHECKING,
  30 )
  31 from blib2to3.pgen2.grammar import Grammar
  32 from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
  33
  34 if TYPE_CHECKING:
  35     from blib2to3.driver import TokenProxy
  36
  37
  38 Results = Dict[Text, NL]
  39 Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
  40 DFA = List[List[Tuple[int, int]]]
  41 DFAS = Tuple[DFA, Dict[int, int]]
  42
  43
  44 def lam_sub(grammar: Grammar, node: RawNode) -> NL:
  45     assert node[3] is not None
  46     return Node(type=node[0], children=node[3], context=node[2])
  47
  48
  49 # A placeholder node, used when parser is backtracking.
  50 DUMMY_NODE = (-1, None, None, None)
  51
  52
  53 def stack_copy(
  54     stack: List[Tuple[DFAS, int, RawNode]]
  55 ) -> List[Tuple[DFAS, int, RawNode]]:
  56     """Nodeless stack copy."""
  57     return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
  58
  59
  60 class Recorder:
  61     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
  62         self.parser = parser
  63         self._ilabels = ilabels
  64         self.context = context  # not really matter
  65
  66         self._dead_ilabels: Set[int] = set()
  67         self._start_point = self.parser.stack
  68         self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
  69
  70     @property
  71     def ilabels(self) -> Set[int]:
  72         return self._dead_ilabels.symmetric_difference(self._ilabels)
  73
  74     @contextmanager
  75     def switch_to(self, ilabel: int) -> Iterator[None]:
  76         with self.backtrack():
  77             self.parser.stack = self._points[ilabel]
  78             try:
  79                 yield
  80             except ParseError:
  81                 self._dead_ilabels.add(ilabel)
  82             finally:
  83                 self.parser.stack = self._start_point
  84
  85     @contextmanager
  86     def backtrack(self) -> Iterator[None]:
  87         """
  88         Use the node-level invariant ones for basic parsing operations (push/pop/shift).
  89         These still will operate on the stack; but they won't create any new nodes, or
  90         modify the contents of any other existing nodes.
  91
  92         This saves us a ton of time when we are backtracking, since we
  93         want to restore to the initial state as quick as possible, which
  94         can only be done by having as little mutatations as possible.
  95         """
  96         is_backtracking = self.parser.is_backtracking
  97         try:
  98             self.parser.is_backtracking = True
  99             yield
 100         finally:
 101             self.parser.is_backtracking = is_backtracking
 102
 103     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
 104         func: Callable[..., Any]
 105         if raw:
 106             func = self.parser._addtoken
 107         else:
 108             func = self.parser.addtoken
 109
 110         for ilabel in self.ilabels:
 111             with self.switch_to(ilabel):
 112                 args = [tok_type, tok_val, self.context]
 113                 if raw:
 114                     args.insert(0, ilabel)
 115                 func(*args)
 116
 117     def determine_route(self, value: Text = None, force: bool = False) -> Optional[int]:
 118         alive_ilabels = self.ilabels
 119         if len(alive_ilabels) == 0:
 120             *_, most_successful_ilabel = self._dead_ilabels
 121             raise ParseError("bad input", most_successful_ilabel, value, self.context)
 122
 123         ilabel, *rest = alive_ilabels
 124         if force or not rest:
 125             return ilabel
 126         else:
 127             return None
 128
 129
 130 class ParseError(Exception):
 131     """Exception to signal the parser is stuck."""
 132
 133     def __init__(
 134         self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
 135     ) -> None:
 136         Exception.__init__(
 137             self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
 138         )
 139         self.msg = msg
 140         self.type = type
 141         self.value = value
 142         self.context = context
 143
 144
 145 class Parser(object):
 146     """Parser engine.
 147
 148     The proper usage sequence is:
 149
 150     p = Parser(grammar, [converter])  # create instance
 151     p.setup([start])                  # prepare for parsing
 152     <for each input token>:
 153         if p.addtoken(...):           # parse a token; may raise ParseError
 154             break
 155     root = p.rootnode                 # root of abstract syntax tree
 156
 157     A Parser instance may be reused by calling setup() repeatedly.
 158
 159     A Parser instance contains state pertaining to the current token
 160     sequence, and should not be used concurrently by different threads
 161     to parse separate token sequences.
 162
 163     See driver.py for how to get input tokens by tokenizing a file or
 164     string.
 165
 166     Parsing is complete when addtoken() returns True; the root of the
 167     abstract syntax tree can then be retrieved from the rootnode
 168     instance variable.  When a syntax error occurs, addtoken() raises
 169     the ParseError exception.  There is no error recovery; the parser
 170     cannot be used after a syntax error was reported (but it can be
 171     reinitialized by calling setup()).
 172
 173     """
 174
 175     def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
 176         """Constructor.
 177
 178         The grammar argument is a grammar.Grammar instance; see the
 179         grammar module for more information.
 180
 181         The parser is not ready yet for parsing; you must call the
 182         setup() method to get it started.
 183
 184         The optional convert argument is a function mapping concrete
 185         syntax tree nodes to abstract syntax tree nodes.  If not
 186         given, no conversion is done and the syntax tree produced is
 187         the concrete syntax tree.  If given, it must be a function of
 188         two arguments, the first being the grammar (a grammar.Grammar
 189         instance), and the second being the concrete syntax tree node
 190         to be converted.  The syntax tree is converted from the bottom
 191         up.
 192
 193         **post-note: the convert argument is ignored since for Black's
 194         usage, convert will always be blib2to3.pytree.convert. Allowing
 195         this to be dynamic hurts mypyc's ability to use early binding.
 196         These docs are left for historical and informational value.
 197
 198         A concrete syntax tree node is a (type, value, context, nodes)
 199         tuple, where type is the node type (a token or symbol number),
 200         value is None for symbols and a string for tokens, context is
 201         None or an opaque value used for error reporting (typically a
 202         (lineno, offset) pair), and nodes is a list of children for
 203         symbols, and None for tokens.
 204
 205         An abstract syntax tree node may be anything; this is entirely
 206         up to the converter function.
 207
 208         """
 209         self.grammar = grammar
 210         # See note in docstring above. TL;DR this is ignored.
 211         self.convert = convert or lam_sub
 212         self.is_backtracking = False
 213
 214     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
 215         """Prepare for parsing.
 216
 217         This *must* be called before starting to parse.
 218
 219         The optional argument is an alternative start symbol; it
 220         defaults to the grammar's start symbol.
 221
 222         You can use a Parser instance to parse any number of programs;
 223         each time you call setup() the parser is reset to an initial
 224         state determined by the (implicit or explicit) start symbol.
 225
 226         """
 227         if start is None:
 228             start = self.grammar.start
 229         # Each stack entry is a tuple: (dfa, state, node).
 230         # A node is a tuple: (type, value, context, children),
 231         # where children is a list of nodes or None, and context may be None.
 232         newnode: RawNode = (start, None, None, [])
 233         stackentry = (self.grammar.dfas[start], 0, newnode)
 234         self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
 235         self.rootnode: Optional[NL] = None
 236         self.used_names: Set[str] = set()
 237         self.proxy = proxy
 238
 239     def addtoken(self, type: int, value: Text, context: Context) -> bool:
 240         """Add a token; return True iff this is the end of the program."""
 241         # Map from token to label
 242         ilabels = self.classify(type, value, context)
 243         assert len(ilabels) >= 1
 244
 245         # If we have only one state to advance, we'll directly
 246         # take it as is.
 247         if len(ilabels) == 1:
 248             [ilabel] = ilabels
 249             return self._addtoken(ilabel, type, value, context)
 250
 251         # If there are multiple states which we can advance (only
 252         # happen under soft-keywords), then we will try all of them
 253         # in parallel and as soon as one state can reach further than
 254         # the rest, we'll choose that one. This is a pretty hacky
 255         # and hopefully temporary algorithm.
 256         #
 257         # For a more detailed explanation, check out this post:
 258         # https://tree.science/what-the-backtracking.html
 259
 260         with self.proxy.release() as proxy:
 261             counter, force = 0, False
 262             recorder = Recorder(self, ilabels, context)
 263             recorder.add_token(type, value, raw=True)
 264
 265             next_token_value = value
 266             while recorder.determine_route(next_token_value) is None:
 267                 if not proxy.can_advance(counter):
 268                     force = True
 269                     break
 270
 271                 next_token_type, next_token_value, *_ = proxy.eat(counter)
 272                 if next_token_type in (tokenize.COMMENT, tokenize.NL):
 273                     counter += 1
 274                     continue
 275
 276                 if next_token_type == tokenize.OP:
 277                     next_token_type = grammar.opmap[next_token_value]
 278
 279                 recorder.add_token(next_token_type, next_token_value)
 280                 counter += 1
 281
 282             ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
 283             assert ilabel is not None
 284
 285         return self._addtoken(ilabel, type, value, context)
 286
 287     def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
 288         # Loop until the token is shifted; may raise exceptions
 289         while True:
 290             dfa, state, node = self.stack[-1]
 291             states, first = dfa
 292             arcs = states[state]
 293             # Look for a state with this label
 294             for i, newstate in arcs:
 295                 t = self.grammar.labels[i][0]
 296                 if t >= 256:
 297                     # See if it's a symbol and if we're in its first set
 298                     itsdfa = self.grammar.dfas[t]
 299                     itsstates, itsfirst = itsdfa
 300                     if ilabel in itsfirst:
 301                         # Push a symbol
 302                         self.push(t, itsdfa, newstate, context)
 303                         break  # To continue the outer while loop
 304
 305                 elif ilabel == i:
 306                     # Look it up in the list of labels
 307                     # Shift a token; we're done with it
 308                     self.shift(type, value, newstate, context)
 309                     # Pop while we are in an accept-only state
 310                     state = newstate
 311                     while states[state] == [(0, state)]:
 312                         self.pop()
 313                         if not self.stack:
 314                             # Done parsing!
 315                             return True
 316                         dfa, state, node = self.stack[-1]
 317                         states, first = dfa
 318                     # Done with this token
 319                     return False
 320
 321             else:
 322                 if (0, state) in arcs:
 323                     # An accepting state, pop it and try something else
 324                     self.pop()
 325                     if not self.stack:
 326                         # Done parsing, but another token is input
 327                         raise ParseError("too much input", type, value, context)
 328                 else:
 329                     # No success finding a transition
 330                     raise ParseError("bad input", type, value, context)
 331
 332     def classify(self, type: int, value: Text, context: Context) -> List[int]:
 333         """Turn a token into a label.  (Internal)
 334
 335         Depending on whether the value is a soft-keyword or not,
 336         this function may return multiple labels to choose from."""
 337         if type == token.NAME:
 338             # Keep a listing of all used names
 339             self.used_names.add(value)
 340             # Check for reserved words
 341             if value in self.grammar.keywords:
 342                 return [self.grammar.keywords[value]]
 343             elif value in self.grammar.soft_keywords:
 344                 assert type in self.grammar.tokens
 345                 return [
 346                     self.grammar.soft_keywords[value],
 347                     self.grammar.tokens[type],
 348                 ]
 349
 350         ilabel = self.grammar.tokens.get(type)
 351         if ilabel is None:
 352             raise ParseError("bad token", type, value, context)
 353         return [ilabel]
 354
 355     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
 356         """Shift a token.  (Internal)"""
 357         if self.is_backtracking:
 358             dfa, state, _ = self.stack[-1]
 359             self.stack[-1] = (dfa, newstate, DUMMY_NODE)
 360         else:
 361             dfa, state, node = self.stack[-1]
 362             rawnode: RawNode = (type, value, context, None)
 363             newnode = convert(self.grammar, rawnode)
 364             assert node[-1] is not None
 365             node[-1].append(newnode)
 366             self.stack[-1] = (dfa, newstate, node)
 367
 368     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
 369         """Push a nonterminal.  (Internal)"""
 370         if self.is_backtracking:
 371             dfa, state, _ = self.stack[-1]
 372             self.stack[-1] = (dfa, newstate, DUMMY_NODE)
 373             self.stack.append((newdfa, 0, DUMMY_NODE))
 374         else:
 375             dfa, state, node = self.stack[-1]
 376             newnode: RawNode = (type, None, context, [])
 377             self.stack[-1] = (dfa, newstate, node)
 378             self.stack.append((newdfa, 0, newnode))
 379
 380     def pop(self) -> None:
 381         """Pop a nonterminal.  (Internal)"""
 382         if self.is_backtracking:
 383             self.stack.pop()
 384         else:
 385             popdfa, popstate, popnode = self.stack.pop()
 386             newnode = convert(self.grammar, popnode)
 387             if self.stack:
 388                 dfa, state, node = self.stack[-1]
 389                 assert node[-1] is not None
 390                 node[-1].append(newnode)
 391             else:
 392                 self.rootnode = newnode
 393                 self.rootnode.used_names = self.used_names