# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from contextlib import contextmanager

from . import grammar, token, tokenize
from typing import (
    TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Text,
    Tuple, Union, cast
)
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.driver import TokenProxy


Results = Dict[Text, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]

def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]]
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]

class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant operations for the basic parsing
        operations (push/pop/shift). These still operate on the stack, but
        they won't create any new nodes, or modify the contents of any
        other existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore to the initial state as quickly as possible, which
        can only be done by having as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(self, value: Optional[Text] = None, force: bool = False) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None

class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
    ) -> None:
        Exception.__init__(
            self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context

class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """
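    # A minimal sketch of the sequence above, assuming `grammar` is a loaded
    # grammar.Grammar, `proxy` is a TokenProxy, and `tokens` yields
    # (type, value, context) triples; in practice driver.py owns this loop:
    #
    #     p = Parser(grammar)
    #     p.setup(proxy)
    #     for tok_type, tok_value, tok_context in tokens:
    #         if p.addtoken(tok_type, tok_value, tok_context):
    #             break
    #     tree = p.rootnode
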
    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
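        # Illustrative raw-node examples for the shape described above (the
        # symbol number 271 is made up for illustration):
        #     (token.NAME, "x", (1, 0), None)          # a token node
        #     (271, None, None, [<child raw nodes>])   # a symbol node being built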
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: Text, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this only
        # happens under soft-keywords), then we will try all of them
        # in parallel and as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
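        #
        # An illustrative case of such an ambiguity (assuming `match` is among
        # the grammar's soft keywords): after the tokens `match (x)` the input
        # could still be either a call to a function named `match` or the start
        # of a match statement, so both candidate labels are advanced until the
        # following tokens (e.g. a trailing `:` and an indented block) settle it.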
        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: Text, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
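        # Illustrative note (depends on the grammar tables that are loaded):
        # for a NAME token whose value is a soft keyword such as "match", this
        # returns two labels -- the soft-keyword label and the plain NAME token
        # label -- and addtoken() decides between them via lookahead.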
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names