# Copyright 2004-2005 Elemental Security, Inc.  All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
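
# A sketch of the "grammar table must be loaded first" step mentioned
# above, assuming pgen2's driver module (not imported by this file):
#
#     from blib2to3.pgen2 import driver
#     g = driver.load_grammar("Grammar.txt")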
import copy
from contextlib import contextmanager
from typing import (
    cast, Any, Callable, Dict, Iterator, List, Optional, Set, Text,
    Tuple, Union, TYPE_CHECKING,
)

# Local imports
from . import grammar, token, tokenize
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.driver import TokenProxy


Results = Dict[Text, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter here

        self._dead_ilabels: Set[int] = set()
        self._start_point = copy.deepcopy(self.parser.stack)
        self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        self.parser.stack = self._points[ilabel]
        try:
            yield
        except ParseError:
            self._dead_ilabels.add(ilabel)
        finally:
            self.parser.stack = self._start_point

    def add_token(
        self, tok_type: int, tok_val: Optional[Text], raw: bool = False
    ) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(
        self, value: Optional[Text] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
    ) -> None:
        Exception.__init__(
            self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.
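
        For example, the keyword "pass" arrives as the raw token node
        (token.NAME, "pass", context, None), while a finished symbol is
        built as (symbol_number, None, context, [child1, child2, ...]).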

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        self.convert = convert or lam_sub

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: Optional[Text], context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this can
        # only happen with soft keywords), then we will try all of them
        # in parallel and, as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
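        #
        # For example, for the NAME token "match" classify() returns both
        # a soft-keyword label and a plain NAME label; only the tokens
        # that follow decide between `match x:` (a match statement) and
        # `match = 1` (an ordinary assignment).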

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[cast(str, next_token_value)]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(
        self, ilabel: int, type: int, value: Optional[Text], context: Context
    ) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t, v = self.grammar.labels[i]
                if ilabel == i:
                    # Look it up in the list of labels
                    assert t < 256
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False
                elif t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, self.grammar.dfas[t], newstate, context)
                        break  # To continue the outer while loop
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: Optional[Text], context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            assert value is not None
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(
        self, type: int, value: Optional[Text], newstate: int, context: Context
    ) -> None:
        """Shift a token.  (Internal)"""
        dfa, state, node = self.stack[-1]
        assert value is not None
        assert context is not None
        rawnode: RawNode = (type, value, context, None)
        newnode = self.convert(self.grammar, rawnode)
        if newnode is not None:
            assert node[-1] is not None
            node[-1].append(newnode)
        self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode: RawNode = (type, None, context, [])
        self.stack[-1] = (dfa, newstate, node)
        self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        popdfa, popstate, popnode = self.stack.pop()
        newnode = self.convert(self.grammar, popnode)
        if newnode is not None:
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
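

# A minimal end-to-end usage sketch (an illustration, assuming the
# lib2to3-style Driver in driver.py and the pygram/pytree modules that
# ship alongside this package; none of them are imported here):
#
#     from blib2to3 import pygram, pytree
#     from blib2to3.pgen2 import driver
#
#     d = driver.Driver(pygram.python_grammar, convert=pytree.convert)
#     tree = d.parse_string("pass\n")
#
# The Driver tokenizes the input and feeds each token to addtoken(),
# following exactly the loop shown in the Parser docstring above.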