# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union, cast

# Local imports
from . import grammar, token, tokenize
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = Dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]],
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
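
# Illustrative note (hypothetical values): each stack entry is a
# (dfa, state, node) tuple, and stack_copy() replaces every node with
# DUMMY_NODE, so a snapshot such as
#   [(dfas_for_start_symbol, 0, (256, None, None, [...]))]
# becomes
#   [(dfas_for_start_symbol, 0, DUMMY_NODE)]
# The Recorder below keeps one such nodeless copy per candidate ilabel and
# swaps them in while exploring alternatives, so no real tree nodes are
# created or mutated during speculative parsing.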


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant ones for basic parsing operations (push/pop/shift).
        These will still operate on the stack, but they won't create any new nodes, or
        modify the contents of any other existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore to the initial state as quickly as possible, which
        can only be done by having as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking
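
    # Rough sketch of the interplay: add_token() below enters switch_to(ilabel),
    # which in turn enters backtrack(). While is_backtracking is set,
    # Parser.shift/push/pop only rewrite (dfa, state) pairs and store DUMMY_NODE,
    # so abandoning a speculative branch is just a matter of restoring
    # self.parser.stack to the saved start point.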

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)

                func(*args)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None
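
    # Rough sketch of how Parser.addtoken() drives a Recorder when a token is
    # ambiguous (for example, a soft keyword):
    #
    #   recorder = Recorder(parser, ilabels, context)
    #   recorder.add_token(tok_type, tok_val, raw=True)
    #   while recorder.determine_route(next_value) is None:
    #       ...feed lookahead tokens via recorder.add_token(...)...
    #   ilabel = recorder.determine_route(next_value, force=...)
    #
    # determine_route() returns an ilabel once only one candidate survives (or
    # when forced), and raises ParseError once every candidate has died.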


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup(proxy, [start])           # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """
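
    # A minimal usage sketch (hypothetical grammar/proxy objects; in Black the
    # pgen2 driver normally does this wiring):
    #
    #   p = Parser(g)                      # g: a grammar.Grammar instance
    #   p.setup(proxy)                     # proxy: a TokenProxy over the token stream
    #   for type, value, context in tokens:
    #       if p.addtoken(type, value, context):
    #           break                      # start symbol fully parsed
    #   tree = p.rootnode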

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this can only
        # happen under soft-keywords), then we will try all of them
        # in parallel and as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)
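
    # For instance (illustrative): when the tokenizer yields the NAME "match",
    # classify() returns two ilabels, one for the soft keyword and one for a
    # plain NAME. addtoken() then replays the following tokens against both
    # candidate stacks via a Recorder until only one interpretation can still
    # advance, and commits to that one.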

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]
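
    # Illustrative examples (label numbers are grammar-dependent):
    #   classify(token.NAME, "if", ctx)    -> [keyword label for "if"]
    #   classify(token.NAME, "match", ctx) -> [soft-keyword label, generic NAME label]
    #   classify(token.NUMBER, "42", ctx)  -> [label for NUMBER]
    # Only the soft-keyword case produces more than one candidate, which is
    # what triggers the Recorder-based lookahead in addtoken().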

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
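
# In Black this module is not normally used directly; blib2to3.pgen2.driver
# tokenizes the source, wraps the token stream in a TokenProxy, and feeds the
# tokens to Parser.addtoken() as sketched above. Roughly (names and signatures
# simplified):
#
#   from blib2to3.pgen2 import driver
#   d = driver.Driver(grammar_instance)
#   tree = d.parse_string(source_code)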