]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

fuzzer: add special-case for multi-line EOF TokenError (#1991)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 from typing import (
8     Any,
9     Dict,
10     IO,
11     Iterable,
12     Iterator,
13     List,
14     Optional,
15     Text,
16     Tuple,
17     Union,
18     Sequence,
19     NoReturn,
20 )
21 from blib2to3.pgen2 import grammar
22 from blib2to3.pgen2.tokenize import GoodTokenInfo
23 import os
24
25
26 Path = Union[str, "os.PathLike[str]"]
27
28
29 class PgenGrammar(grammar.Grammar):
30     pass
31
32
33 class ParserGenerator(object):
34
35     filename: Path
36     stream: IO[Text]
37     generator: Iterator[GoodTokenInfo]
38     first: Dict[Text, Optional[Dict[Text, int]]]
39
40     def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
41         close_stream = None
42         if stream is None:
43             stream = open(filename)
44             close_stream = stream.close
45         self.filename = filename
46         self.stream = stream
47         self.generator = tokenize.generate_tokens(stream.readline)
48         self.gettoken()  # Initialize lookahead
49         self.dfas, self.startsymbol = self.parse()
50         if close_stream is not None:
51             close_stream()
52         self.first = {}  # map from symbol name to set of tokens
53         self.addfirstsets()
54
55     def make_grammar(self) -> PgenGrammar:
56         c = PgenGrammar()
57         names = list(self.dfas.keys())
58         names.sort()
59         names.remove(self.startsymbol)
60         names.insert(0, self.startsymbol)
61         for name in names:
62             i = 256 + len(c.symbol2number)
63             c.symbol2number[name] = i
64             c.number2symbol[i] = name
65         for name in names:
66             dfa = self.dfas[name]
67             states = []
68             for state in dfa:
69                 arcs = []
70                 for label, next in sorted(state.arcs.items()):
71                     arcs.append((self.make_label(c, label), dfa.index(next)))
72                 if state.isfinal:
73                     arcs.append((0, dfa.index(state)))
74                 states.append(arcs)
75             c.states.append(states)
76             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
77         c.start = c.symbol2number[self.startsymbol]
78         return c
79
80     def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
81         rawfirst = self.first[name]
82         assert rawfirst is not None
83         first = {}
84         for label in sorted(rawfirst):
85             ilabel = self.make_label(c, label)
86             ##assert ilabel not in first # XXX failed on <> ... !=
87             first[ilabel] = 1
88         return first
89
90     def make_label(self, c: PgenGrammar, label: Text) -> int:
91         # XXX Maybe this should be a method on a subclass of converter?
92         ilabel = len(c.labels)
93         if label[0].isalpha():
94             # Either a symbol name or a named token
95             if label in c.symbol2number:
96                 # A symbol name (a non-terminal)
97                 if label in c.symbol2label:
98                     return c.symbol2label[label]
99                 else:
100                     c.labels.append((c.symbol2number[label], None))
101                     c.symbol2label[label] = ilabel
102                     return ilabel
103             else:
104                 # A named token (NAME, NUMBER, STRING)
105                 itoken = getattr(token, label, None)
106                 assert isinstance(itoken, int), label
107                 assert itoken in token.tok_name, label
108                 if itoken in c.tokens:
109                     return c.tokens[itoken]
110                 else:
111                     c.labels.append((itoken, None))
112                     c.tokens[itoken] = ilabel
113                     return ilabel
114         else:
115             # Either a keyword or an operator
116             assert label[0] in ('"', "'"), label
117             value = eval(label)
118             if value[0].isalpha():
119                 # A keyword
120                 if value in c.keywords:
121                     return c.keywords[value]
122                 else:
123                     c.labels.append((token.NAME, value))
124                     c.keywords[value] = ilabel
125                     return ilabel
126             else:
127                 # An operator (any non-numeric token)
128                 itoken = grammar.opmap[value]  # Fails if unknown token
129                 if itoken in c.tokens:
130                     return c.tokens[itoken]
131                 else:
132                     c.labels.append((itoken, None))
133                     c.tokens[itoken] = ilabel
134                     return ilabel
135
136     def addfirstsets(self) -> None:
137         names = list(self.dfas.keys())
138         names.sort()
139         for name in names:
140             if name not in self.first:
141                 self.calcfirst(name)
142             # print name, self.first[name].keys()
143
144     def calcfirst(self, name: Text) -> None:
145         dfa = self.dfas[name]
146         self.first[name] = None  # dummy to detect left recursion
147         state = dfa[0]
148         totalset: Dict[str, int] = {}
149         overlapcheck = {}
150         for label, next in state.arcs.items():
151             if label in self.dfas:
152                 if label in self.first:
153                     fset = self.first[label]
154                     if fset is None:
155                         raise ValueError("recursion for rule %r" % name)
156                 else:
157                     self.calcfirst(label)
158                     fset = self.first[label]
159                     assert fset is not None
160                 totalset.update(fset)
161                 overlapcheck[label] = fset
162             else:
163                 totalset[label] = 1
164                 overlapcheck[label] = {label: 1}
165         inverse: Dict[str, str] = {}
166         for label, itsfirst in overlapcheck.items():
167             for symbol in itsfirst:
168                 if symbol in inverse:
169                     raise ValueError(
170                         "rule %s is ambiguous; %s is in the first sets of %s as well"
171                         " as %s" % (name, symbol, label, inverse[symbol])
172                     )
173                 inverse[symbol] = label
174         self.first[name] = totalset
175
176     def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
177         dfas = {}
178         startsymbol: Optional[str] = None
179         # MSTART: (NEWLINE | RULE)* ENDMARKER
180         while self.type != token.ENDMARKER:
181             while self.type == token.NEWLINE:
182                 self.gettoken()
183             # RULE: NAME ':' RHS NEWLINE
184             name = self.expect(token.NAME)
185             self.expect(token.OP, ":")
186             a, z = self.parse_rhs()
187             self.expect(token.NEWLINE)
188             # self.dump_nfa(name, a, z)
189             dfa = self.make_dfa(a, z)
190             # self.dump_dfa(name, dfa)
191             oldlen = len(dfa)
192             self.simplify_dfa(dfa)
193             newlen = len(dfa)
194             dfas[name] = dfa
195             # print name, oldlen, newlen
196             if startsymbol is None:
197                 startsymbol = name
198         assert startsymbol is not None
199         return dfas, startsymbol
200
201     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
202         # To turn an NFA into a DFA, we define the states of the DFA
203         # to correspond to *sets* of states of the NFA.  Then do some
204         # state reduction.  Let's represent sets as dicts with 1 for
205         # values.
206         assert isinstance(start, NFAState)
207         assert isinstance(finish, NFAState)
208
209         def closure(state: NFAState) -> Dict[NFAState, int]:
210             base: Dict[NFAState, int] = {}
211             addclosure(state, base)
212             return base
213
214         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
215             assert isinstance(state, NFAState)
216             if state in base:
217                 return
218             base[state] = 1
219             for label, next in state.arcs:
220                 if label is None:
221                     addclosure(next, base)
222
223         states = [DFAState(closure(start), finish)]
224         for state in states:  # NB states grows while we're iterating
225             arcs: Dict[str, Dict[NFAState, int]] = {}
226             for nfastate in state.nfaset:
227                 for label, next in nfastate.arcs:
228                     if label is not None:
229                         addclosure(next, arcs.setdefault(label, {}))
230             for label, nfaset in sorted(arcs.items()):
231                 for st in states:
232                     if st.nfaset == nfaset:
233                         break
234                 else:
235                     st = DFAState(nfaset, finish)
236                     states.append(st)
237                 state.addarc(st, label)
238         return states  # List of DFAState instances; first one is start
239
240     def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
241         print("Dump of NFA for", name)
242         todo = [start]
243         for i, state in enumerate(todo):
244             print("  State", i, state is finish and "(final)" or "")
245             for label, next in state.arcs:
246                 if next in todo:
247                     j = todo.index(next)
248                 else:
249                     j = len(todo)
250                     todo.append(next)
251                 if label is None:
252                     print("    -> %d" % j)
253                 else:
254                     print("    %s -> %d" % (label, j))
255
256     def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
257         print("Dump of DFA for", name)
258         for i, state in enumerate(dfa):
259             print("  State", i, state.isfinal and "(final)" or "")
260             for label, next in sorted(state.arcs.items()):
261                 print("    %s -> %d" % (label, dfa.index(next)))
262
263     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
264         # This is not theoretically optimal, but works well enough.
265         # Algorithm: repeatedly look for two states that have the same
266         # set of arcs (same labels pointing to the same nodes) and
267         # unify them, until things stop changing.
268
269         # dfa is a list of DFAState instances
270         changes = True
271         while changes:
272             changes = False
273             for i, state_i in enumerate(dfa):
274                 for j in range(i + 1, len(dfa)):
275                     state_j = dfa[j]
276                     if state_i == state_j:
277                         # print "  unify", i, j
278                         del dfa[j]
279                         for state in dfa:
280                             state.unifystate(state_j, state_i)
281                         changes = True
282                         break
283
284     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
285         # RHS: ALT ('|' ALT)*
286         a, z = self.parse_alt()
287         if self.value != "|":
288             return a, z
289         else:
290             aa = NFAState()
291             zz = NFAState()
292             aa.addarc(a)
293             z.addarc(zz)
294             while self.value == "|":
295                 self.gettoken()
296                 a, z = self.parse_alt()
297                 aa.addarc(a)
298                 z.addarc(zz)
299             return aa, zz
300
301     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
302         # ALT: ITEM+
303         a, b = self.parse_item()
304         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
305             c, d = self.parse_item()
306             b.addarc(c)
307             b = d
308         return a, b
309
310     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
311         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
312         if self.value == "[":
313             self.gettoken()
314             a, z = self.parse_rhs()
315             self.expect(token.OP, "]")
316             a.addarc(z)
317             return a, z
318         else:
319             a, z = self.parse_atom()
320             value = self.value
321             if value not in ("+", "*"):
322                 return a, z
323             self.gettoken()
324             z.addarc(a)
325             if value == "+":
326                 return a, z
327             else:
328                 return a, a
329
330     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
331         # ATOM: '(' RHS ')' | NAME | STRING
332         if self.value == "(":
333             self.gettoken()
334             a, z = self.parse_rhs()
335             self.expect(token.OP, ")")
336             return a, z
337         elif self.type in (token.NAME, token.STRING):
338             a = NFAState()
339             z = NFAState()
340             a.addarc(z, self.value)
341             self.gettoken()
342             return a, z
343         else:
344             self.raise_error(
345                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
346             )
347             assert False
348
349     def expect(self, type: int, value: Optional[Any] = None) -> Text:
350         if self.type != type or (value is not None and self.value != value):
351             self.raise_error(
352                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
353             )
354         value = self.value
355         self.gettoken()
356         return value
357
358     def gettoken(self) -> None:
359         tup = next(self.generator)
360         while tup[0] in (tokenize.COMMENT, tokenize.NL):
361             tup = next(self.generator)
362         self.type, self.value, self.begin, self.end, self.line = tup
363         # print token.tok_name[self.type], repr(self.value)
364
365     def raise_error(self, msg: str, *args: Any) -> NoReturn:
366         if args:
367             try:
368                 msg = msg % args
369             except:
370                 msg = " ".join([msg] + list(map(str, args)))
371         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
372
373
374 class NFAState(object):
375     arcs: List[Tuple[Optional[Text], "NFAState"]]
376
377     def __init__(self) -> None:
378         self.arcs = []  # list of (label, NFAState) pairs
379
380     def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
381         assert label is None or isinstance(label, str)
382         assert isinstance(next, NFAState)
383         self.arcs.append((label, next))
384
385
386 class DFAState(object):
387     nfaset: Dict[NFAState, Any]
388     isfinal: bool
389     arcs: Dict[Text, "DFAState"]
390
391     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
392         assert isinstance(nfaset, dict)
393         assert isinstance(next(iter(nfaset)), NFAState)
394         assert isinstance(final, NFAState)
395         self.nfaset = nfaset
396         self.isfinal = final in nfaset
397         self.arcs = {}  # map from label to DFAState
398
399     def addarc(self, next: "DFAState", label: Text) -> None:
400         assert isinstance(label, str)
401         assert label not in self.arcs
402         assert isinstance(next, DFAState)
403         self.arcs[label] = next
404
405     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
406         for label, next in self.arcs.items():
407             if next is old:
408                 self.arcs[label] = new
409
410     def __eq__(self, other: Any) -> bool:
411         # Equality test -- ignore the nfaset instance variable
412         assert isinstance(other, DFAState)
413         if self.isfinal != other.isfinal:
414             return False
415         # Can't just return self.arcs == other.arcs, because that
416         # would invoke this method recursively, with cycles...
417         if len(self.arcs) != len(other.arcs):
418             return False
419         for label, next in self.arcs.items():
420             if next is not other.arcs.get(label):
421                 return False
422         return True
423
424     __hash__: Any = None  # For Py3 compatibility.
425
426
427 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
428     p = ParserGenerator(filename)
429     return p.make_grammar()