]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

[trivial] Use proper test cases on `unittest` (#2775)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 from typing import (
8     Any,
9     Dict,
10     IO,
11     Iterator,
12     List,
13     Optional,
14     Text,
15     Tuple,
16     Union,
17     Sequence,
18     NoReturn,
19 )
20 from blib2to3.pgen2 import grammar
21 from blib2to3.pgen2.tokenize import GoodTokenInfo
22 import os
23
24
25 Path = Union[str, "os.PathLike[str]"]
26
27
28 class PgenGrammar(grammar.Grammar):
29     pass
30
31
32 class ParserGenerator(object):
33
34     filename: Path
35     stream: IO[Text]
36     generator: Iterator[GoodTokenInfo]
37     first: Dict[Text, Optional[Dict[Text, int]]]
38
39     def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
40         close_stream = None
41         if stream is None:
42             stream = open(filename)
43             close_stream = stream.close
44         self.filename = filename
45         self.stream = stream
46         self.generator = tokenize.generate_tokens(stream.readline)
47         self.gettoken()  # Initialize lookahead
48         self.dfas, self.startsymbol = self.parse()
49         if close_stream is not None:
50             close_stream()
51         self.first = {}  # map from symbol name to set of tokens
52         self.addfirstsets()
53
54     def make_grammar(self) -> PgenGrammar:
55         c = PgenGrammar()
56         names = list(self.dfas.keys())
57         names.sort()
58         names.remove(self.startsymbol)
59         names.insert(0, self.startsymbol)
60         for name in names:
61             i = 256 + len(c.symbol2number)
62             c.symbol2number[name] = i
63             c.number2symbol[i] = name
64         for name in names:
65             dfa = self.dfas[name]
66             states = []
67             for state in dfa:
68                 arcs = []
69                 for label, next in sorted(state.arcs.items()):
70                     arcs.append((self.make_label(c, label), dfa.index(next)))
71                 if state.isfinal:
72                     arcs.append((0, dfa.index(state)))
73                 states.append(arcs)
74             c.states.append(states)
75             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
76         c.start = c.symbol2number[self.startsymbol]
77         return c
78
79     def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
80         rawfirst = self.first[name]
81         assert rawfirst is not None
82         first = {}
83         for label in sorted(rawfirst):
84             ilabel = self.make_label(c, label)
85             ##assert ilabel not in first # XXX failed on <> ... !=
86             first[ilabel] = 1
87         return first
88
89     def make_label(self, c: PgenGrammar, label: Text) -> int:
90         # XXX Maybe this should be a method on a subclass of converter?
91         ilabel = len(c.labels)
92         if label[0].isalpha():
93             # Either a symbol name or a named token
94             if label in c.symbol2number:
95                 # A symbol name (a non-terminal)
96                 if label in c.symbol2label:
97                     return c.symbol2label[label]
98                 else:
99                     c.labels.append((c.symbol2number[label], None))
100                     c.symbol2label[label] = ilabel
101                     return ilabel
102             else:
103                 # A named token (NAME, NUMBER, STRING)
104                 itoken = getattr(token, label, None)
105                 assert isinstance(itoken, int), label
106                 assert itoken in token.tok_name, label
107                 if itoken in c.tokens:
108                     return c.tokens[itoken]
109                 else:
110                     c.labels.append((itoken, None))
111                     c.tokens[itoken] = ilabel
112                     return ilabel
113         else:
114             # Either a keyword or an operator
115             assert label[0] in ('"', "'"), label
116             value = eval(label)
117             if value[0].isalpha():
118                 if label[0] == '"':
119                     keywords = c.soft_keywords
120                 else:
121                     keywords = c.keywords
122
123                 # A keyword
124                 if value in keywords:
125                     return keywords[value]
126                 else:
127                     c.labels.append((token.NAME, value))
128                     keywords[value] = ilabel
129                     return ilabel
130             else:
131                 # An operator (any non-numeric token)
132                 itoken = grammar.opmap[value]  # Fails if unknown token
133                 if itoken in c.tokens:
134                     return c.tokens[itoken]
135                 else:
136                     c.labels.append((itoken, None))
137                     c.tokens[itoken] = ilabel
138                     return ilabel
139
140     def addfirstsets(self) -> None:
141         names = list(self.dfas.keys())
142         names.sort()
143         for name in names:
144             if name not in self.first:
145                 self.calcfirst(name)
146             # print name, self.first[name].keys()
147
148     def calcfirst(self, name: Text) -> None:
149         dfa = self.dfas[name]
150         self.first[name] = None  # dummy to detect left recursion
151         state = dfa[0]
152         totalset: Dict[str, int] = {}
153         overlapcheck = {}
154         for label, next in state.arcs.items():
155             if label in self.dfas:
156                 if label in self.first:
157                     fset = self.first[label]
158                     if fset is None:
159                         raise ValueError("recursion for rule %r" % name)
160                 else:
161                     self.calcfirst(label)
162                     fset = self.first[label]
163                     assert fset is not None
164                 totalset.update(fset)
165                 overlapcheck[label] = fset
166             else:
167                 totalset[label] = 1
168                 overlapcheck[label] = {label: 1}
169         inverse: Dict[str, str] = {}
170         for label, itsfirst in overlapcheck.items():
171             for symbol in itsfirst:
172                 if symbol in inverse:
173                     raise ValueError(
174                         "rule %s is ambiguous; %s is in the first sets of %s as well"
175                         " as %s" % (name, symbol, label, inverse[symbol])
176                     )
177                 inverse[symbol] = label
178         self.first[name] = totalset
179
180     def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
181         dfas = {}
182         startsymbol: Optional[str] = None
183         # MSTART: (NEWLINE | RULE)* ENDMARKER
184         while self.type != token.ENDMARKER:
185             while self.type == token.NEWLINE:
186                 self.gettoken()
187             # RULE: NAME ':' RHS NEWLINE
188             name = self.expect(token.NAME)
189             self.expect(token.OP, ":")
190             a, z = self.parse_rhs()
191             self.expect(token.NEWLINE)
192             # self.dump_nfa(name, a, z)
193             dfa = self.make_dfa(a, z)
194             # self.dump_dfa(name, dfa)
195             oldlen = len(dfa)
196             self.simplify_dfa(dfa)
197             newlen = len(dfa)
198             dfas[name] = dfa
199             # print name, oldlen, newlen
200             if startsymbol is None:
201                 startsymbol = name
202         assert startsymbol is not None
203         return dfas, startsymbol
204
205     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
206         # To turn an NFA into a DFA, we define the states of the DFA
207         # to correspond to *sets* of states of the NFA.  Then do some
208         # state reduction.  Let's represent sets as dicts with 1 for
209         # values.
210         assert isinstance(start, NFAState)
211         assert isinstance(finish, NFAState)
212
213         def closure(state: NFAState) -> Dict[NFAState, int]:
214             base: Dict[NFAState, int] = {}
215             addclosure(state, base)
216             return base
217
218         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
219             assert isinstance(state, NFAState)
220             if state in base:
221                 return
222             base[state] = 1
223             for label, next in state.arcs:
224                 if label is None:
225                     addclosure(next, base)
226
227         states = [DFAState(closure(start), finish)]
228         for state in states:  # NB states grows while we're iterating
229             arcs: Dict[str, Dict[NFAState, int]] = {}
230             for nfastate in state.nfaset:
231                 for label, next in nfastate.arcs:
232                     if label is not None:
233                         addclosure(next, arcs.setdefault(label, {}))
234             for label, nfaset in sorted(arcs.items()):
235                 for st in states:
236                     if st.nfaset == nfaset:
237                         break
238                 else:
239                     st = DFAState(nfaset, finish)
240                     states.append(st)
241                 state.addarc(st, label)
242         return states  # List of DFAState instances; first one is start
243
244     def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
245         print("Dump of NFA for", name)
246         todo = [start]
247         for i, state in enumerate(todo):
248             print("  State", i, state is finish and "(final)" or "")
249             for label, next in state.arcs:
250                 if next in todo:
251                     j = todo.index(next)
252                 else:
253                     j = len(todo)
254                     todo.append(next)
255                 if label is None:
256                     print("    -> %d" % j)
257                 else:
258                     print("    %s -> %d" % (label, j))
259
260     def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
261         print("Dump of DFA for", name)
262         for i, state in enumerate(dfa):
263             print("  State", i, state.isfinal and "(final)" or "")
264             for label, next in sorted(state.arcs.items()):
265                 print("    %s -> %d" % (label, dfa.index(next)))
266
267     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
268         # This is not theoretically optimal, but works well enough.
269         # Algorithm: repeatedly look for two states that have the same
270         # set of arcs (same labels pointing to the same nodes) and
271         # unify them, until things stop changing.
272
273         # dfa is a list of DFAState instances
274         changes = True
275         while changes:
276             changes = False
277             for i, state_i in enumerate(dfa):
278                 for j in range(i + 1, len(dfa)):
279                     state_j = dfa[j]
280                     if state_i == state_j:
281                         # print "  unify", i, j
282                         del dfa[j]
283                         for state in dfa:
284                             state.unifystate(state_j, state_i)
285                         changes = True
286                         break
287
288     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
289         # RHS: ALT ('|' ALT)*
290         a, z = self.parse_alt()
291         if self.value != "|":
292             return a, z
293         else:
294             aa = NFAState()
295             zz = NFAState()
296             aa.addarc(a)
297             z.addarc(zz)
298             while self.value == "|":
299                 self.gettoken()
300                 a, z = self.parse_alt()
301                 aa.addarc(a)
302                 z.addarc(zz)
303             return aa, zz
304
305     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
306         # ALT: ITEM+
307         a, b = self.parse_item()
308         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
309             c, d = self.parse_item()
310             b.addarc(c)
311             b = d
312         return a, b
313
314     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
315         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
316         if self.value == "[":
317             self.gettoken()
318             a, z = self.parse_rhs()
319             self.expect(token.OP, "]")
320             a.addarc(z)
321             return a, z
322         else:
323             a, z = self.parse_atom()
324             value = self.value
325             if value not in ("+", "*"):
326                 return a, z
327             self.gettoken()
328             z.addarc(a)
329             if value == "+":
330                 return a, z
331             else:
332                 return a, a
333
334     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
335         # ATOM: '(' RHS ')' | NAME | STRING
336         if self.value == "(":
337             self.gettoken()
338             a, z = self.parse_rhs()
339             self.expect(token.OP, ")")
340             return a, z
341         elif self.type in (token.NAME, token.STRING):
342             a = NFAState()
343             z = NFAState()
344             a.addarc(z, self.value)
345             self.gettoken()
346             return a, z
347         else:
348             self.raise_error(
349                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
350             )
351             assert False
352
353     def expect(self, type: int, value: Optional[Any] = None) -> Text:
354         if self.type != type or (value is not None and self.value != value):
355             self.raise_error(
356                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
357             )
358         value = self.value
359         self.gettoken()
360         return value
361
362     def gettoken(self) -> None:
363         tup = next(self.generator)
364         while tup[0] in (tokenize.COMMENT, tokenize.NL):
365             tup = next(self.generator)
366         self.type, self.value, self.begin, self.end, self.line = tup
367         # print token.tok_name[self.type], repr(self.value)
368
369     def raise_error(self, msg: str, *args: Any) -> NoReturn:
370         if args:
371             try:
372                 msg = msg % args
373             except:
374                 msg = " ".join([msg] + list(map(str, args)))
375         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
376
377
378 class NFAState(object):
379     arcs: List[Tuple[Optional[Text], "NFAState"]]
380
381     def __init__(self) -> None:
382         self.arcs = []  # list of (label, NFAState) pairs
383
384     def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
385         assert label is None or isinstance(label, str)
386         assert isinstance(next, NFAState)
387         self.arcs.append((label, next))
388
389
390 class DFAState(object):
391     nfaset: Dict[NFAState, Any]
392     isfinal: bool
393     arcs: Dict[Text, "DFAState"]
394
395     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
396         assert isinstance(nfaset, dict)
397         assert isinstance(next(iter(nfaset)), NFAState)
398         assert isinstance(final, NFAState)
399         self.nfaset = nfaset
400         self.isfinal = final in nfaset
401         self.arcs = {}  # map from label to DFAState
402
403     def addarc(self, next: "DFAState", label: Text) -> None:
404         assert isinstance(label, str)
405         assert label not in self.arcs
406         assert isinstance(next, DFAState)
407         self.arcs[label] = next
408
409     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
410         for label, next in self.arcs.items():
411             if next is old:
412                 self.arcs[label] = new
413
414     def __eq__(self, other: Any) -> bool:
415         # Equality test -- ignore the nfaset instance variable
416         assert isinstance(other, DFAState)
417         if self.isfinal != other.isfinal:
418             return False
419         # Can't just return self.arcs == other.arcs, because that
420         # would invoke this method recursively, with cycles...
421         if len(self.arcs) != len(other.arcs):
422             return False
423         for label, next in self.arcs.items():
424             if next is not other.arcs.get(label):
425                 return False
426         return True
427
428     __hash__: Any = None  # For Py3 compatibility.
429
430
431 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
432     p = ParserGenerator(filename)
433     return p.make_grammar()