# Copyright 2004-2005 Elemental Security, Inc.  All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from contextlib import contextmanager
from typing import (
    TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union, cast
)

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = Dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
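# Illustrative note (not from the original source): a DFA here is a list of
# states, and each state is a list of (label, next_state) arcs, e.g.
# [[(1, 1)], [(0, 1)]], where a state of the form [(0, state)] is accept-only.
# A DFAS pairs that table with the symbol's first-set dict, which is how
# _addtoken() unpacks it: ``itsstates, itsfirst = itsdfa``.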


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]],
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use node-level invariant operations for the basic parsing operations
        (push/pop/shift).  These still operate on the stack, but they won't
        create any new nodes or modify the contents of any existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done by making as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).
    """
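
    # Illustrative sketch, not part of the original file: in blib2to3 the
    # parser is normally driven by blib2to3.pgen2.driver, which builds the
    # TokenProxy passed to setup() and feeds classified tokens roughly like
    # this (the token loop below is a placeholder, not a real API):
    #
    #     p = Parser(grammar)
    #     p.setup(token_proxy)
    #     for type, value, start, end, line in tokens:
    #         if p.addtoken(type, value, (prefix, start)):
    #             break
    #     tree = p.rootnode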

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert.  Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.
        """
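        # Illustrative examples, assumed from the description above rather
        # than taken from the original source: a token node might look like
        # (token.NUMBER, "42", ctx, None), while a symbol node might look
        # like (symbol_number, None, ctx, [child_nodes]).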
        self.grammar = grammar
        # See note in docstring above.  TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.
        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If there is only one state we can advance to, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this only happens
        # with soft keywords), then we try all of them in parallel, and as
        # soon as one state can reach further than the rest, we choose that
        # one.  This is a pretty hacky and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
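        #
        # Illustrative example, not part of the original comment: for a NAME
        # token such as "match", classify() returns both the plain NAME label
        # and the soft-keyword label; the Recorder below advances both
        # candidate parses over the upcoming tokens until only one survives.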

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop
                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement.  So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names