# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
import copy
from contextlib import contextmanager
from typing import (
    TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Text, Tuple, Union, cast
)

# Local imports
from . import grammar, token, tokenize
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.driver import TokenProxy

Results = Dict[Text, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
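
# A sketch of the shapes behind the aliases above (values made up; see
# blib2to3.pgen2.grammar for the authoritative description): a DFA is a list
# of states, each state a list of (label, next_state) arcs; a DFAS pairs the
# state list with the nonterminal's first set, keyed by token label.
#
#     example_dfa: DFA = [[(1, 1)], [(0, 1)]]     # arc on label 1, then accept
#     example_dfas: DFAS = (example_dfa, {1: 1})  # (states, first set)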


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]]
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Switch the basic parsing operations (push/pop/shift) to their
        node-preserving variants. These still operate on the stack, but they
        won't create any new nodes or modify the contents of existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore to the initial state as quickly as possible, which
        can only be done by making as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(
        self, value: Optional[Text] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None
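

# How Recorder is used (see Parser.addtoken below): when classify() returns
# several labels for a soft keyword, addtoken() snapshots the stack once per
# candidate label, replays the same lookahead tokens into every snapshot via
# add_token(), and asks determine_route() after each step until a single
# candidate parse survives (or the lookahead is exhausted and ``force`` picks
# the first survivor).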


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
    ) -> None:
        Exception.__init__(
            self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
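        # An illustrative raw node (values made up): a token node looks like
        #     (token.NAME, "x", (1, 0), None)
        # while a symbol node carries a list of children instead of a value:
        #     (a_symbol_number, None, (1, 0), [child_node, another_child])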
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: Text, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this can only
        # happen with soft keywords), then we will try all of them in
        # parallel and as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
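        #
        # For example, at the start of a statement the NAME "match" may begin
        # a match statement (soft keyword) or an ordinary expression such as a
        # call to a function named "match"; both candidate parses are advanced
        # over the following tokens until one fails or outlasts the other.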

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

        ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
        assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: Text, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]
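        # For instance, with a grammar whose soft_keywords contain "match",
        # classify(token.NAME, "match", ctx) returns two labels (the soft
        # keyword's and the generic NAME token's); any other name or token
        # yields a single-element list.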

    def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
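

# A sketch of the drive loop (not part of this module; ``tokens``, ``proxy``
# and ``prefix`` are illustrative names; in Black, blib2to3.pgen2.driver
# owns this loop and constructs the TokenProxy):
#
#     parser = Parser(grammar)      # convert is ignored; pytree.convert is used
#     parser.setup(proxy)
#     for type_, value, start, end, line in tokens:
#         if parser.addtoken(type_, value, (prefix, start)):
#             break                 # True means the whole program was parsed
#     tree = parser.rootnode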