src/blib2to3/pgen2/parse.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """Parser engine for the grammar tables generated by pgen.
   5
   6 The grammar table must be loaded first.
   7
   8 See Parser/parser.c in the Python distribution for additional info on
   9 how this parsing engine works.
  10
  11 """
  12 import copy
  13 from contextlib import contextmanager
  14
  15 # Local imports
  16 from . import grammar, token, tokenize
  17 from typing import (
  18     cast,
  19     Any,
  20     Optional,
  21     Text,
  22     Union,
  23     Tuple,
  24     Dict,
  25     List,
  26     Iterator,
  27     Callable,
  28     Set,
  29     TYPE_CHECKING,
  30 )
  31 from blib2to3.pgen2.grammar import Grammar
  32 from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
  33
  34 if TYPE_CHECKING:
  35     from blib2to3.driver import TokenProxy
  36
  37
  38 Results = Dict[Text, NL]
  39 Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
  40 DFA = List[List[Tuple[int, int]]]
  41 DFAS = Tuple[DFA, Dict[int, int]]
  42
  43
  44 def lam_sub(grammar: Grammar, node: RawNode) -> NL:
  45     assert node[3] is not None
  46     return Node(type=node[0], children=node[3], context=node[2])
  47
  48
  49 class Recorder:
  50     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
  51         self.parser = parser
  52         self._ilabels = ilabels
  53         self.context = context  # not really matter
  54
  55         self._dead_ilabels: Set[int] = set()
  56         self._start_point = self.parser.stack
  57         self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}
  58
  59     @property
  60     def ilabels(self) -> Set[int]:
  61         return self._dead_ilabels.symmetric_difference(self._ilabels)
  62
  63     @contextmanager
  64     def switch_to(self, ilabel: int) -> Iterator[None]:
  65         self.parser.stack = self._points[ilabel]
  66         try:
  67             yield
  68         except ParseError:
  69             self._dead_ilabels.add(ilabel)
  70         finally:
  71             self.parser.stack = self._start_point
  72
  73     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
  74         func: Callable[..., Any]
  75         if raw:
  76             func = self.parser._addtoken
  77         else:
  78             func = self.parser.addtoken
  79
  80         for ilabel in self.ilabels:
  81             with self.switch_to(ilabel):
  82                 args = [tok_type, tok_val, self.context]
  83                 if raw:
  84                     args.insert(0, ilabel)
  85                 func(*args)
  86
  87     def determine_route(self, value: Text = None, force: bool = False) -> Optional[int]:
  88         alive_ilabels = self.ilabels
  89         if len(alive_ilabels) == 0:
  90             *_, most_successful_ilabel = self._dead_ilabels
  91             raise ParseError("bad input", most_successful_ilabel, value, self.context)
  92
  93         ilabel, *rest = alive_ilabels
  94         if force or not rest:
  95             return ilabel
  96         else:
  97             return None
  98
  99
 100 class ParseError(Exception):
 101     """Exception to signal the parser is stuck."""
 102
 103     def __init__(
 104         self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
 105     ) -> None:
 106         Exception.__init__(
 107             self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
 108         )
 109         self.msg = msg
 110         self.type = type
 111         self.value = value
 112         self.context = context
 113
 114
 115 class Parser(object):
 116     """Parser engine.
 117
 118     The proper usage sequence is:
 119
 120     p = Parser(grammar, [converter])  # create instance
 121     p.setup([start])                  # prepare for parsing
 122     <for each input token>:
 123         if p.addtoken(...):           # parse a token; may raise ParseError
 124             break
 125     root = p.rootnode                 # root of abstract syntax tree
 126
 127     A Parser instance may be reused by calling setup() repeatedly.
 128
 129     A Parser instance contains state pertaining to the current token
 130     sequence, and should not be used concurrently by different threads
 131     to parse separate token sequences.
 132
 133     See driver.py for how to get input tokens by tokenizing a file or
 134     string.
 135
 136     Parsing is complete when addtoken() returns True; the root of the
 137     abstract syntax tree can then be retrieved from the rootnode
 138     instance variable.  When a syntax error occurs, addtoken() raises
 139     the ParseError exception.  There is no error recovery; the parser
 140     cannot be used after a syntax error was reported (but it can be
 141     reinitialized by calling setup()).
 142
 143     """
 144
 145     def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
 146         """Constructor.
 147
 148         The grammar argument is a grammar.Grammar instance; see the
 149         grammar module for more information.
 150
 151         The parser is not ready yet for parsing; you must call the
 152         setup() method to get it started.
 153
 154         The optional convert argument is a function mapping concrete
 155         syntax tree nodes to abstract syntax tree nodes.  If not
 156         given, no conversion is done and the syntax tree produced is
 157         the concrete syntax tree.  If given, it must be a function of
 158         two arguments, the first being the grammar (a grammar.Grammar
 159         instance), and the second being the concrete syntax tree node
 160         to be converted.  The syntax tree is converted from the bottom
 161         up.
 162
 163         **post-note: the convert argument is ignored since for Black's
 164         usage, convert will always be blib2to3.pytree.convert. Allowing
 165         this to be dynamic hurts mypyc's ability to use early binding.
 166         These docs are left for historical and informational value.
 167
 168         A concrete syntax tree node is a (type, value, context, nodes)
 169         tuple, where type is the node type (a token or symbol number),
 170         value is None for symbols and a string for tokens, context is
 171         None or an opaque value used for error reporting (typically a
 172         (lineno, offset) pair), and nodes is a list of children for
 173         symbols, and None for tokens.
 174
 175         An abstract syntax tree node may be anything; this is entirely
 176         up to the converter function.
 177
 178         """
 179         self.grammar = grammar
 180         # See note in docstring above. TL;DR this is ignored.
 181         self.convert = convert or lam_sub
 182
 183     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
 184         """Prepare for parsing.
 185
 186         This *must* be called before starting to parse.
 187
 188         The optional argument is an alternative start symbol; it
 189         defaults to the grammar's start symbol.
 190
 191         You can use a Parser instance to parse any number of programs;
 192         each time you call setup() the parser is reset to an initial
 193         state determined by the (implicit or explicit) start symbol.
 194
 195         """
 196         if start is None:
 197             start = self.grammar.start
 198         # Each stack entry is a tuple: (dfa, state, node).
 199         # A node is a tuple: (type, value, context, children),
 200         # where children is a list of nodes or None, and context may be None.
 201         newnode: RawNode = (start, None, None, [])
 202         stackentry = (self.grammar.dfas[start], 0, newnode)
 203         self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
 204         self.rootnode: Optional[NL] = None
 205         self.used_names: Set[str] = set()
 206         self.proxy = proxy
 207
 208     def addtoken(self, type: int, value: Text, context: Context) -> bool:
 209         """Add a token; return True iff this is the end of the program."""
 210         # Map from token to label
 211         ilabels = self.classify(type, value, context)
 212         assert len(ilabels) >= 1
 213
 214         # If we have only one state to advance, we'll directly
 215         # take it as is.
 216         if len(ilabels) == 1:
 217             [ilabel] = ilabels
 218             return self._addtoken(ilabel, type, value, context)
 219
 220         # If there are multiple states which we can advance (only
 221         # happen under soft-keywords), then we will try all of them
 222         # in parallel and as soon as one state can reach further than
 223         # the rest, we'll choose that one. This is a pretty hacky
 224         # and hopefully temporary algorithm.
 225         #
 226         # For a more detailed explanation, check out this post:
 227         # https://tree.science/what-the-backtracking.html
 228
 229         with self.proxy.release() as proxy:
 230             counter, force = 0, False
 231             recorder = Recorder(self, ilabels, context)
 232             recorder.add_token(type, value, raw=True)
 233
 234             next_token_value = value
 235             while recorder.determine_route(next_token_value) is None:
 236                 if not proxy.can_advance(counter):
 237                     force = True
 238                     break
 239
 240                 next_token_type, next_token_value, *_ = proxy.eat(counter)
 241                 if next_token_type == tokenize.OP:
 242                     next_token_type = grammar.opmap[next_token_value]
 243
 244                 recorder.add_token(next_token_type, next_token_value)
 245                 counter += 1
 246
 247             ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
 248             assert ilabel is not None
 249
 250         return self._addtoken(ilabel, type, value, context)
 251
 252     def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
 253         # Loop until the token is shifted; may raise exceptions
 254         while True:
 255             dfa, state, node = self.stack[-1]
 256             states, first = dfa
 257             arcs = states[state]
 258             # Look for a state with this label
 259             for i, newstate in arcs:
 260                 t = self.grammar.labels[i][0]
 261                 if t >= 256:
 262                     # See if it's a symbol and if we're in its first set
 263                     itsdfa = self.grammar.dfas[t]
 264                     itsstates, itsfirst = itsdfa
 265                     if ilabel in itsfirst:
 266                         # Push a symbol
 267                         self.push(t, itsdfa, newstate, context)
 268                         break  # To continue the outer while loop
 269
 270                 elif ilabel == i:
 271                     # Look it up in the list of labels
 272                     # Shift a token; we're done with it
 273                     self.shift(type, value, newstate, context)
 274                     # Pop while we are in an accept-only state
 275                     state = newstate
 276                     while states[state] == [(0, state)]:
 277                         self.pop()
 278                         if not self.stack:
 279                             # Done parsing!
 280                             return True
 281                         dfa, state, node = self.stack[-1]
 282                         states, first = dfa
 283                     # Done with this token
 284                     return False
 285
 286             else:
 287                 if (0, state) in arcs:
 288                     # An accepting state, pop it and try something else
 289                     self.pop()
 290                     if not self.stack:
 291                         # Done parsing, but another token is input
 292                         raise ParseError("too much input", type, value, context)
 293                 else:
 294                     # No success finding a transition
 295                     raise ParseError("bad input", type, value, context)
 296
 297     def classify(self, type: int, value: Text, context: Context) -> List[int]:
 298         """Turn a token into a label.  (Internal)
 299
 300         Depending on whether the value is a soft-keyword or not,
 301         this function may return multiple labels to choose from."""
 302         if type == token.NAME:
 303             # Keep a listing of all used names
 304             self.used_names.add(value)
 305             # Check for reserved words
 306             if value in self.grammar.keywords:
 307                 return [self.grammar.keywords[value]]
 308             elif value in self.grammar.soft_keywords:
 309                 assert type in self.grammar.tokens
 310                 return [
 311                     self.grammar.soft_keywords[value],
 312                     self.grammar.tokens[type],
 313                 ]
 314
 315         ilabel = self.grammar.tokens.get(type)
 316         if ilabel is None:
 317             raise ParseError("bad token", type, value, context)
 318         return [ilabel]
 319
 320     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
 321         """Shift a token.  (Internal)"""
 322         dfa, state, node = self.stack[-1]
 323         rawnode: RawNode = (type, value, context, None)
 324         newnode = convert(self.grammar, rawnode)
 325         assert node[-1] is not None
 326         node[-1].append(newnode)
 327         self.stack[-1] = (dfa, newstate, node)
 328
 329     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
 330         """Push a nonterminal.  (Internal)"""
 331         dfa, state, node = self.stack[-1]
 332         newnode: RawNode = (type, None, context, [])
 333         self.stack[-1] = (dfa, newstate, node)
 334         self.stack.append((newdfa, 0, newnode))
 335
 336     def pop(self) -> None:
 337         """Pop a nonterminal.  (Internal)"""
 338         popdfa, popstate, popnode = self.stack.pop()
 339         newnode = convert(self.grammar, popnode)
 340         if self.stack:
 341             dfa, state, node = self.stack[-1]
 342             assert node[-1] is not None
 343             node[-1].append(newnode)
 344         else:
 345             self.rootnode = newnode
 346             self.rootnode.used_names = self.used_names