# Copyright 2004-2005 Elemental Security, Inc.  All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
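
# A sketch of the "grammar table must be loaded first" step mentioned
# above, assuming pgen2's driver module (not imported by this file):
#
#     from blib2to3.pgen2 import driver
#     g = driver.load_grammar("Grammar.txt")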
import copy
from contextlib import contextmanager
from typing import (
    cast, Any, Callable, Dict, Iterator, List, Optional, Set, Text,
    Tuple, Union, TYPE_CHECKING,
)

# Local imports
from . import grammar, token, tokenize
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, RawNode, Leaf, Node

if TYPE_CHECKING:
    from blib2to3.driver import TokenProxy


Results = Dict[Text, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


class Recorder:
    def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter here

        self._dead_ilabels: Set[int] = set()
        self._start_point = copy.deepcopy(self.parser.stack)
        self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> Set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        self.parser.stack = self._points[ilabel]
        try:
            yield
        except ParseError:
            self._dead_ilabels.add(ilabel)
        finally:
            self.parser.stack = self._start_point

    def add_token(
        self, tok_type: int, tok_val: Optional[Text], raw: bool = False
    ) -> None:
        func: Callable[..., Any]
        if raw:
            func = self.parser._addtoken
        else:
            func = self.parser.addtoken

        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                args = [tok_type, tok_val, self.context]
                if raw:
                    args.insert(0, ilabel)
                func(*args)

    def determine_route(
        self, value: Optional[Text] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
    ) -> None:
        Exception.__init__(
            self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.
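
        For example, the keyword "pass" arrives as the raw token node
        (token.NAME, "pass", context, None), while a finished symbol is
        built as (symbol_number, None, context, [child1, child2, ...]).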

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        self.convert = convert or lam_sub

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: Set[str] = set()
        self.proxy = proxy

    def addtoken(self, type: int, value: Optional[Text], context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (this can
        # only happen with soft keywords), then we will try all of them
        # in parallel and, as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
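        #
        # For example, for the NAME token "match" classify() returns both
        # a soft-keyword label and a plain NAME label; only the tokens
        # that follow decide between `match x:` (a match statement) and
        # `match = 1` (an ordinary assignment).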

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break
                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[cast(str, next_token_value)]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(
        self, ilabel: int, type: int, value: Optional[Text], context: Context
    ) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t, v = self.grammar.labels[i]
                if ilabel == i:
                    # Look it up in the list of labels
                    assert t < 256
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False
                elif t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, self.grammar.dfas[t], newstate, context)
                        break  # To continue the outer while loop
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: Optional[Text], context: Context) -> List[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            assert value is not None
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                return [
                    self.grammar.soft_keywords[value],
                    self.grammar.tokens[type],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(
        self, type: int, value: Optional[Text], newstate: int, context: Context
    ) -> None:
        """Shift a token.  (Internal)"""
        dfa, state, node = self.stack[-1]
        assert value is not None
        assert context is not None
        rawnode: RawNode = (type, value, context, None)
        newnode = self.convert(self.grammar, rawnode)
        if newnode is not None:
            assert node[-1] is not None
            node[-1].append(newnode)
        self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode: RawNode = (type, None, context, [])
        self.stack[-1] = (dfa, newstate, node)
        self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)"""
        popdfa, popstate, popnode = self.stack.pop()
        newnode = self.convert(self.grammar, popnode)
        if newnode is not None:
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
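

# A minimal end-to-end usage sketch (an illustration, assuming the
# lib2to3-style Driver in driver.py and the pygram/pytree modules that
# ship alongside this package; none of them are imported here):
#
#     from blib2to3 import pygram, pytree
#     from blib2to3.pgen2 import driver
#
#     d = driver.Driver(pygram.python_grammar, convert=pytree.convert)
#     tree = d.parse_string("pass\n")
#
# The Driver tokenizes the input and feeds each token to addtoken(),
# following exactly the loop shown in the Parser docstring above.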