# Copyright 2004-2005 Elemental Security, Inc.  All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from contextlib import contextmanager
from typing import (
    TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union, cast
)

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = Dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
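# Illustrative note (not from the original source): a DFA here is a list of
# states, and each state is a list of (label, next_state) arcs, e.g.
# [[(1, 1)], [(0, 1)]], where a state of the form [(0, state)] is accept-only.
# A DFAS pairs that table with the symbol's first-set dict, which is how
# _addtoken() unpacks it: ``itsstates, itsfirst = itsdfa``.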


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: List[Tuple[DFAS, int, RawNode]],
) -> List[Tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: Set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use node-level invariant operations for the basic parsing operations
        (push/pop/shift).  These still operate on the stack, but they won't
        create any new nodes or modify the contents of any existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done by making as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).
    """
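
    # Illustrative sketch, not part of the original file: in blib2to3 the
    # parser is normally driven by blib2to3.pgen2.driver, which builds the
    # TokenProxy passed to setup() and feeds classified tokens roughly like
    # this (the token loop below is a placeholder, not a real API):
    #
    #     p = Parser(grammar)
    #     p.setup(token_proxy)
    #     for type, value, start, end, line in tokens:
    #         if p.addtoken(type, value, (prefix, start)):
    #             break
    #     tree = p.rootnode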

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert.  Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.
        """
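        # Illustrative examples, assumed from the description above rather
        # than taken from the original source: a token node might look like
        # (token.NUMBER, "42", ctx, None), while a symbol node might look
        # like (symbol_number, None, ctx, [child_nodes]).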
        self.grammar = grammar
        # See note in docstring above.  TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.
        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If there is only one state we can advance to, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this only happens
        # with soft keywords), then we try all of them in parallel, and as
        # soon as one state can reach further than the rest, we choose that
        # one.  This is a pretty hacky and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
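        #
        # Illustrative example, not part of the original comment: for a NAME
        # token such as "match", classify() returns both the plain NAME label
        # and the soft-keyword label; the Recorder below advances both
        # candidate parses over the upcoming tokens until only one survives.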

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop
                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement.  So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names