All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
4 """Parser engine for the grammar tables generated by pgen.
6 The grammar table must be loaded first.
8 See Parser/parser.c in the Python distribution for additional info on
9 how this parsing engine works.
13 from contextlib import contextmanager
16 from . import grammar, token, tokenize
31 from blib2to3.pgen2.grammar import Grammar
32 from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
35 from blib2to3.driver import TokenProxy
38 Results = Dict[Text, NL]
39 Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
40 DFA = List[List[Tuple[int, int]]]
41 DFAS = Tuple[DFA, Dict[int, int]]
44 def lam_sub(grammar: Grammar, node: RawNode) -> NL:
45 assert node[3] is not None
46 return Node(type=node[0], children=node[3], context=node[2])
50 def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
52 self._ilabels = ilabels
53 self.context = context # not really matter
55 self._dead_ilabels: Set[int] = set()
56 self._start_point = copy.deepcopy(self.parser.stack)
57 self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}
60 def ilabels(self) -> Set[int]:
61 return self._dead_ilabels.symmetric_difference(self._ilabels)
64 def switch_to(self, ilabel: int) -> Iterator[None]:
65 self.parser.stack = self._points[ilabel]
69 self._dead_ilabels.add(ilabel)
71 self.parser.stack = self._start_point
73 def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
74 func: Callable[..., Any]
76 func = self.parser._addtoken
78 func = self.parser.addtoken
80 for ilabel in self.ilabels:
81 with self.switch_to(ilabel):
82 args = [tok_type, tok_val, self.context]
84 args.insert(0, ilabel)
87 def determine_route(self, value: Text = None, force: bool = False) -> Optional[int]:
88 alive_ilabels = self.ilabels
89 if len(alive_ilabels) == 0:
90 *_, most_successful_ilabel = self._dead_ilabels
91 raise ParseError("bad input", most_successful_ilabel, value, self.context)
93 ilabel, *rest = alive_ilabels
100 class ParseError(Exception):
101 """Exception to signal the parser is stuck."""
104 self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
107 self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
112 self.context = context
115 class Parser(object):
118 The proper usage sequence is:
120 p = Parser(grammar, [converter]) # create instance
121 p.setup([start]) # prepare for parsing
122 <for each input token>:
123 if p.addtoken(...): # parse a token; may raise ParseError
125 root = p.rootnode # root of abstract syntax tree
127 A Parser instance may be reused by calling setup() repeatedly.
129 A Parser instance contains state pertaining to the current token
130 sequence, and should not be used concurrently by different threads
131 to parse separate token sequences.
133 See driver.py for how to get input tokens by tokenizing a file or
136 Parsing is complete when addtoken() returns True; the root of the
137 abstract syntax tree can then be retrieved from the rootnode
138 instance variable. When a syntax error occurs, addtoken() raises
139 the ParseError exception. There is no error recovery; the parser
140 cannot be used after a syntax error was reported (but it can be
141 reinitialized by calling setup()).
145 def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
148 The grammar argument is a grammar.Grammar instance; see the
149 grammar module for more information.
151 The parser is not ready yet for parsing; you must call the
152 setup() method to get it started.
154 The optional convert argument is a function mapping concrete
155 syntax tree nodes to abstract syntax tree nodes. If not
156 given, no conversion is done and the syntax tree produced is
157 the concrete syntax tree. If given, it must be a function of
158 two arguments, the first being the grammar (a grammar.Grammar
159 instance), and the second being the concrete syntax tree node
160 to be converted. The syntax tree is converted from the bottom
163 **post-note: the convert argument is ignored since for Black's
164 usage, convert will always be blib2to3.pytree.convert. Allowing
165 this to be dynamic hurts mypyc's ability to use early binding.
166 These docs are left for historical and informational value.
168 A concrete syntax tree node is a (type, value, context, nodes)
169 tuple, where type is the node type (a token or symbol number),
170 value is None for symbols and a string for tokens, context is
171 None or an opaque value used for error reporting (typically a
172 (lineno, offset) pair), and nodes is a list of children for
173 symbols, and None for tokens.
175 An abstract syntax tree node may be anything; this is entirely
176 up to the converter function.
179 self.grammar = grammar
180 # See note in docstring above. TL;DR this is ignored.
181 self.convert = convert or lam_sub
183 def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
184 """Prepare for parsing.
186 This *must* be called before starting to parse.
188 The optional argument is an alternative start symbol; it
189 defaults to the grammar's start symbol.
191 You can use a Parser instance to parse any number of programs;
192 each time you call setup() the parser is reset to an initial
193 state determined by the (implicit or explicit) start symbol.
197 start = self.grammar.start
198 # Each stack entry is a tuple: (dfa, state, node).
199 # A node is a tuple: (type, value, context, children),
200 # where children is a list of nodes or None, and context may be None.
201 newnode: RawNode = (start, None, None, [])
202 stackentry = (self.grammar.dfas[start], 0, newnode)
203 self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
204 self.rootnode: Optional[NL] = None
205 self.used_names: Set[str] = set()
208 def addtoken(self, type: int, value: Text, context: Context) -> bool:
209 """Add a token; return True iff this is the end of the program."""
210 # Map from token to label
211 ilabels = self.classify(type, value, context)
212 assert len(ilabels) >= 1
214 # If we have only one state to advance, we'll directly
216 if len(ilabels) == 1:
218 return self._addtoken(ilabel, type, value, context)
220 # If there are multiple states which we can advance (only
221 # happen under soft-keywords), then we will try all of them
222 # in parallel and as soon as one state can reach further than
223 # the rest, we'll choose that one. This is a pretty hacky
224 # and hopefully temporary algorithm.
226 # For a more detailed explanation, check out this post:
227 # https://tree.science/what-the-backtracking.html
229 with self.proxy.release() as proxy:
230 counter, force = 0, False
231 recorder = Recorder(self, ilabels, context)
232 recorder.add_token(type, value, raw=True)
234 next_token_value = value
235 while recorder.determine_route(next_token_value) is None:
236 if not proxy.can_advance(counter):
240 next_token_type, next_token_value, *_ = proxy.eat(counter)
241 if next_token_type == tokenize.OP:
242 next_token_type = grammar.opmap[next_token_value]
244 recorder.add_token(next_token_type, next_token_value)
247 ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
248 assert ilabel is not None
250 return self._addtoken(ilabel, type, value, context)
252 def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
253 # Loop until the token is shifted; may raise exceptions
255 dfa, state, node = self.stack[-1]
258 # Look for a state with this label
259 for i, newstate in arcs:
260 t = self.grammar.labels[i][0]
262 # See if it's a symbol and if we're in its first set
263 itsdfa = self.grammar.dfas[t]
264 itsstates, itsfirst = itsdfa
265 if ilabel in itsfirst:
267 self.push(t, itsdfa, newstate, context)
268 break # To continue the outer while loop
271 # Look it up in the list of labels
272 # Shift a token; we're done with it
273 self.shift(type, value, newstate, context)
274 # Pop while we are in an accept-only state
276 while states[state] == [(0, state)]:
281 dfa, state, node = self.stack[-1]
283 # Done with this token
287 if (0, state) in arcs:
288 # An accepting state, pop it and try something else
291 # Done parsing, but another token is input
292 raise ParseError("too much input", type, value, context)
294 # No success finding a transition
295 raise ParseError("bad input", type, value, context)
297 def classify(self, type: int, value: Text, context: Context) -> List[int]:
298 """Turn a token into a label. (Internal)
300 Depending on whether the value is a soft-keyword or not,
301 this function may return multiple labels to choose from."""
302 if type == token.NAME:
303 # Keep a listing of all used names
304 self.used_names.add(value)
305 # Check for reserved words
306 if value in self.grammar.keywords:
307 return [self.grammar.keywords[value]]
308 elif value in self.grammar.soft_keywords:
309 assert type in self.grammar.tokens
311 self.grammar.soft_keywords[value],
312 self.grammar.tokens[type],
315 ilabel = self.grammar.tokens.get(type)
317 raise ParseError("bad token", type, value, context)
320 def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
321 """Shift a token. (Internal)"""
322 dfa, state, node = self.stack[-1]
323 rawnode: RawNode = (type, value, context, None)
324 newnode = convert(self.grammar, rawnode)
325 assert node[-1] is not None
326 node[-1].append(newnode)
327 self.stack[-1] = (dfa, newstate, node)
329 def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
330 """Push a nonterminal. (Internal)"""
331 dfa, state, node = self.stack[-1]
332 newnode: RawNode = (type, None, context, [])
333 self.stack[-1] = (dfa, newstate, node)
334 self.stack.append((newdfa, 0, newnode))
336 def pop(self) -> None:
337 """Pop a nonterminal. (Internal)"""
338 popdfa, popstate, popnode = self.stack.pop()
339 newnode = convert(self.grammar, popnode)
341 dfa, state, node = self.stack[-1]
342 assert node[-1] is not None
343 node[-1].append(newnode)
345 self.rootnode = newnode
346 self.rootnode.used_names = self.used_names