]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Bump furo from 2023.7.26 to 2023.8.19 in /docs + sphinx to 7.2.3 (#3848)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 import os
5 from typing import (
6     IO,
7     Any,
8     Dict,
9     Iterator,
10     List,
11     NoReturn,
12     Optional,
13     Sequence,
14     Tuple,
15     Union,
16 )
17
18 from blib2to3.pgen2 import grammar, token, tokenize
19 from blib2to3.pgen2.tokenize import GoodTokenInfo
20
21 Path = Union[str, "os.PathLike[str]"]
22
23
24 class PgenGrammar(grammar.Grammar):
25     pass
26
27
28 class ParserGenerator:
29     filename: Path
30     stream: IO[str]
31     generator: Iterator[GoodTokenInfo]
32     first: Dict[str, Optional[Dict[str, int]]]
33
34     def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
35         close_stream = None
36         if stream is None:
37             stream = open(filename, encoding="utf-8")
38             close_stream = stream.close
39         self.filename = filename
40         self.stream = stream
41         self.generator = tokenize.generate_tokens(stream.readline)
42         self.gettoken()  # Initialize lookahead
43         self.dfas, self.startsymbol = self.parse()
44         if close_stream is not None:
45             close_stream()
46         self.first = {}  # map from symbol name to set of tokens
47         self.addfirstsets()
48
49     def make_grammar(self) -> PgenGrammar:
50         c = PgenGrammar()
51         names = list(self.dfas.keys())
52         names.sort()
53         names.remove(self.startsymbol)
54         names.insert(0, self.startsymbol)
55         for name in names:
56             i = 256 + len(c.symbol2number)
57             c.symbol2number[name] = i
58             c.number2symbol[i] = name
59         for name in names:
60             dfa = self.dfas[name]
61             states = []
62             for state in dfa:
63                 arcs = []
64                 for label, next in sorted(state.arcs.items()):
65                     arcs.append((self.make_label(c, label), dfa.index(next)))
66                 if state.isfinal:
67                     arcs.append((0, dfa.index(state)))
68                 states.append(arcs)
69             c.states.append(states)
70             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
71         c.start = c.symbol2number[self.startsymbol]
72         return c
73
74     def make_first(self, c: PgenGrammar, name: str) -> Dict[int, int]:
75         rawfirst = self.first[name]
76         assert rawfirst is not None
77         first = {}
78         for label in sorted(rawfirst):
79             ilabel = self.make_label(c, label)
80             ##assert ilabel not in first # XXX failed on <> ... !=
81             first[ilabel] = 1
82         return first
83
84     def make_label(self, c: PgenGrammar, label: str) -> int:
85         # XXX Maybe this should be a method on a subclass of converter?
86         ilabel = len(c.labels)
87         if label[0].isalpha():
88             # Either a symbol name or a named token
89             if label in c.symbol2number:
90                 # A symbol name (a non-terminal)
91                 if label in c.symbol2label:
92                     return c.symbol2label[label]
93                 else:
94                     c.labels.append((c.symbol2number[label], None))
95                     c.symbol2label[label] = ilabel
96                     return ilabel
97             else:
98                 # A named token (NAME, NUMBER, STRING)
99                 itoken = getattr(token, label, None)
100                 assert isinstance(itoken, int), label
101                 assert itoken in token.tok_name, label
102                 if itoken in c.tokens:
103                     return c.tokens[itoken]
104                 else:
105                     c.labels.append((itoken, None))
106                     c.tokens[itoken] = ilabel
107                     return ilabel
108         else:
109             # Either a keyword or an operator
110             assert label[0] in ('"', "'"), label
111             value = eval(label)
112             if value[0].isalpha():
113                 if label[0] == '"':
114                     keywords = c.soft_keywords
115                 else:
116                     keywords = c.keywords
117
118                 # A keyword
119                 if value in keywords:
120                     return keywords[value]
121                 else:
122                     c.labels.append((token.NAME, value))
123                     keywords[value] = ilabel
124                     return ilabel
125             else:
126                 # An operator (any non-numeric token)
127                 itoken = grammar.opmap[value]  # Fails if unknown token
128                 if itoken in c.tokens:
129                     return c.tokens[itoken]
130                 else:
131                     c.labels.append((itoken, None))
132                     c.tokens[itoken] = ilabel
133                     return ilabel
134
135     def addfirstsets(self) -> None:
136         names = list(self.dfas.keys())
137         names.sort()
138         for name in names:
139             if name not in self.first:
140                 self.calcfirst(name)
141             # print name, self.first[name].keys()
142
143     def calcfirst(self, name: str) -> None:
144         dfa = self.dfas[name]
145         self.first[name] = None  # dummy to detect left recursion
146         state = dfa[0]
147         totalset: Dict[str, int] = {}
148         overlapcheck = {}
149         for label in state.arcs:
150             if label in self.dfas:
151                 if label in self.first:
152                     fset = self.first[label]
153                     if fset is None:
154                         raise ValueError("recursion for rule %r" % name)
155                 else:
156                     self.calcfirst(label)
157                     fset = self.first[label]
158                     assert fset is not None
159                 totalset.update(fset)
160                 overlapcheck[label] = fset
161             else:
162                 totalset[label] = 1
163                 overlapcheck[label] = {label: 1}
164         inverse: Dict[str, str] = {}
165         for label, itsfirst in overlapcheck.items():
166             for symbol in itsfirst:
167                 if symbol in inverse:
168                     raise ValueError(
169                         "rule %s is ambiguous; %s is in the first sets of %s as well"
170                         " as %s" % (name, symbol, label, inverse[symbol])
171                     )
172                 inverse[symbol] = label
173         self.first[name] = totalset
174
175     def parse(self) -> Tuple[Dict[str, List["DFAState"]], str]:
176         dfas = {}
177         startsymbol: Optional[str] = None
178         # MSTART: (NEWLINE | RULE)* ENDMARKER
179         while self.type != token.ENDMARKER:
180             while self.type == token.NEWLINE:
181                 self.gettoken()
182             # RULE: NAME ':' RHS NEWLINE
183             name = self.expect(token.NAME)
184             self.expect(token.OP, ":")
185             a, z = self.parse_rhs()
186             self.expect(token.NEWLINE)
187             # self.dump_nfa(name, a, z)
188             dfa = self.make_dfa(a, z)
189             # self.dump_dfa(name, dfa)
190             # oldlen = len(dfa)
191             self.simplify_dfa(dfa)
192             # newlen = len(dfa)
193             dfas[name] = dfa
194             # print name, oldlen, newlen
195             if startsymbol is None:
196                 startsymbol = name
197         assert startsymbol is not None
198         return dfas, startsymbol
199
200     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
201         # To turn an NFA into a DFA, we define the states of the DFA
202         # to correspond to *sets* of states of the NFA.  Then do some
203         # state reduction.  Let's represent sets as dicts with 1 for
204         # values.
205         assert isinstance(start, NFAState)
206         assert isinstance(finish, NFAState)
207
208         def closure(state: NFAState) -> Dict[NFAState, int]:
209             base: Dict[NFAState, int] = {}
210             addclosure(state, base)
211             return base
212
213         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
214             assert isinstance(state, NFAState)
215             if state in base:
216                 return
217             base[state] = 1
218             for label, next in state.arcs:
219                 if label is None:
220                     addclosure(next, base)
221
222         states = [DFAState(closure(start), finish)]
223         for state in states:  # NB states grows while we're iterating
224             arcs: Dict[str, Dict[NFAState, int]] = {}
225             for nfastate in state.nfaset:
226                 for label, next in nfastate.arcs:
227                     if label is not None:
228                         addclosure(next, arcs.setdefault(label, {}))
229             for label, nfaset in sorted(arcs.items()):
230                 for st in states:
231                     if st.nfaset == nfaset:
232                         break
233                 else:
234                     st = DFAState(nfaset, finish)
235                     states.append(st)
236                 state.addarc(st, label)
237         return states  # List of DFAState instances; first one is start
238
239     def dump_nfa(self, name: str, start: "NFAState", finish: "NFAState") -> None:
240         print("Dump of NFA for", name)
241         todo = [start]
242         for i, state in enumerate(todo):
243             print("  State", i, state is finish and "(final)" or "")
244             for label, next in state.arcs:
245                 if next in todo:
246                     j = todo.index(next)
247                 else:
248                     j = len(todo)
249                     todo.append(next)
250                 if label is None:
251                     print("    -> %d" % j)
252                 else:
253                     print("    %s -> %d" % (label, j))
254
255     def dump_dfa(self, name: str, dfa: Sequence["DFAState"]) -> None:
256         print("Dump of DFA for", name)
257         for i, state in enumerate(dfa):
258             print("  State", i, state.isfinal and "(final)" or "")
259             for label, next in sorted(state.arcs.items()):
260                 print("    %s -> %d" % (label, dfa.index(next)))
261
262     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
263         # This is not theoretically optimal, but works well enough.
264         # Algorithm: repeatedly look for two states that have the same
265         # set of arcs (same labels pointing to the same nodes) and
266         # unify them, until things stop changing.
267
268         # dfa is a list of DFAState instances
269         changes = True
270         while changes:
271             changes = False
272             for i, state_i in enumerate(dfa):
273                 for j in range(i + 1, len(dfa)):
274                     state_j = dfa[j]
275                     if state_i == state_j:
276                         # print "  unify", i, j
277                         del dfa[j]
278                         for state in dfa:
279                             state.unifystate(state_j, state_i)
280                         changes = True
281                         break
282
283     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
284         # RHS: ALT ('|' ALT)*
285         a, z = self.parse_alt()
286         if self.value != "|":
287             return a, z
288         else:
289             aa = NFAState()
290             zz = NFAState()
291             aa.addarc(a)
292             z.addarc(zz)
293             while self.value == "|":
294                 self.gettoken()
295                 a, z = self.parse_alt()
296                 aa.addarc(a)
297                 z.addarc(zz)
298             return aa, zz
299
300     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
301         # ALT: ITEM+
302         a, b = self.parse_item()
303         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
304             c, d = self.parse_item()
305             b.addarc(c)
306             b = d
307         return a, b
308
309     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
310         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
311         if self.value == "[":
312             self.gettoken()
313             a, z = self.parse_rhs()
314             self.expect(token.OP, "]")
315             a.addarc(z)
316             return a, z
317         else:
318             a, z = self.parse_atom()
319             value = self.value
320             if value not in ("+", "*"):
321                 return a, z
322             self.gettoken()
323             z.addarc(a)
324             if value == "+":
325                 return a, z
326             else:
327                 return a, a
328
329     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
330         # ATOM: '(' RHS ')' | NAME | STRING
331         if self.value == "(":
332             self.gettoken()
333             a, z = self.parse_rhs()
334             self.expect(token.OP, ")")
335             return a, z
336         elif self.type in (token.NAME, token.STRING):
337             a = NFAState()
338             z = NFAState()
339             a.addarc(z, self.value)
340             self.gettoken()
341             return a, z
342         else:
343             self.raise_error(
344                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
345             )
346             raise AssertionError
347
348     def expect(self, type: int, value: Optional[Any] = None) -> str:
349         if self.type != type or (value is not None and self.value != value):
350             self.raise_error(
351                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
352             )
353         value = self.value
354         self.gettoken()
355         return value
356
357     def gettoken(self) -> None:
358         tup = next(self.generator)
359         while tup[0] in (tokenize.COMMENT, tokenize.NL):
360             tup = next(self.generator)
361         self.type, self.value, self.begin, self.end, self.line = tup
362         # print token.tok_name[self.type], repr(self.value)
363
364     def raise_error(self, msg: str, *args: Any) -> NoReturn:
365         if args:
366             try:
367                 msg = msg % args
368             except Exception:
369                 msg = " ".join([msg] + list(map(str, args)))
370         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
371
372
373 class NFAState:
374     arcs: List[Tuple[Optional[str], "NFAState"]]
375
376     def __init__(self) -> None:
377         self.arcs = []  # list of (label, NFAState) pairs
378
379     def addarc(self, next: "NFAState", label: Optional[str] = None) -> None:
380         assert label is None or isinstance(label, str)
381         assert isinstance(next, NFAState)
382         self.arcs.append((label, next))
383
384
385 class DFAState:
386     nfaset: Dict[NFAState, Any]
387     isfinal: bool
388     arcs: Dict[str, "DFAState"]
389
390     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
391         assert isinstance(nfaset, dict)
392         assert isinstance(next(iter(nfaset)), NFAState)
393         assert isinstance(final, NFAState)
394         self.nfaset = nfaset
395         self.isfinal = final in nfaset
396         self.arcs = {}  # map from label to DFAState
397
398     def addarc(self, next: "DFAState", label: str) -> None:
399         assert isinstance(label, str)
400         assert label not in self.arcs
401         assert isinstance(next, DFAState)
402         self.arcs[label] = next
403
404     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
405         for label, next in self.arcs.items():
406             if next is old:
407                 self.arcs[label] = new
408
409     def __eq__(self, other: Any) -> bool:
410         # Equality test -- ignore the nfaset instance variable
411         assert isinstance(other, DFAState)
412         if self.isfinal != other.isfinal:
413             return False
414         # Can't just return self.arcs == other.arcs, because that
415         # would invoke this method recursively, with cycles...
416         if len(self.arcs) != len(other.arcs):
417             return False
418         for label, next in self.arcs.items():
419             if next is not other.arcs.get(label):
420                 return False
421         return True
422
423     __hash__: Any = None  # For Py3 compatibility.
424
425
426 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
427     p = ParserGenerator(filename)
428     return p.make_grammar()