]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Update README.md (#1453)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 from typing import (
8     Any,
9     Dict,
10     IO,
11     Iterable,
12     Iterator,
13     List,
14     Optional,
15     Text,
16     Tuple,
17     Union,
18     Sequence,
19     NoReturn,
20 )
21 from blib2to3.pgen2 import grammar
22 from blib2to3.pgen2.tokenize import GoodTokenInfo
23 import os
24
25
26 Path = Union[str, "os.PathLike[str]"]
27
28
29 class PgenGrammar(grammar.Grammar):
30     pass
31
32
33 class ParserGenerator(object):
34
35     filename: Path
36     stream: IO[Text]
37     generator: Iterator[GoodTokenInfo]
38     first: Dict[Text, Optional[Dict[Text, int]]]
39
40     def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
41         close_stream = None
42         if stream is None:
43             stream = open(filename)
44             close_stream = stream.close
45         self.filename = filename
46         self.stream = stream
47         self.generator = tokenize.generate_tokens(stream.readline)
48         self.gettoken()  # Initialize lookahead
49         self.dfas, self.startsymbol = self.parse()
50         if close_stream is not None:
51             close_stream()
52         self.first = {}  # map from symbol name to set of tokens
53         self.addfirstsets()
54
55     def make_grammar(self) -> PgenGrammar:
56         c = PgenGrammar()
57         names = list(self.dfas.keys())
58         names.sort()
59         names.remove(self.startsymbol)
60         names.insert(0, self.startsymbol)
61         for name in names:
62             i = 256 + len(c.symbol2number)
63             c.symbol2number[name] = i
64             c.number2symbol[i] = name
65         for name in names:
66             dfa = self.dfas[name]
67             states = []
68             for state in dfa:
69                 arcs = []
70                 for label, next in sorted(state.arcs.items()):
71                     arcs.append((self.make_label(c, label), dfa.index(next)))
72                 if state.isfinal:
73                     arcs.append((0, dfa.index(state)))
74                 states.append(arcs)
75             c.states.append(states)
76             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
77         c.start = c.symbol2number[self.startsymbol]
78         return c
79
80     def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
81         rawfirst = self.first[name]
82         assert rawfirst is not None
83         first = {}
84         for label in sorted(rawfirst):
85             ilabel = self.make_label(c, label)
86             ##assert ilabel not in first # XXX failed on <> ... !=
87             first[ilabel] = 1
88         return first
89
90     def make_label(self, c: PgenGrammar, label: Text) -> int:
91         # XXX Maybe this should be a method on a subclass of converter?
92         ilabel = len(c.labels)
93         if label[0].isalpha():
94             # Either a symbol name or a named token
95             if label in c.symbol2number:
96                 # A symbol name (a non-terminal)
97                 if label in c.symbol2label:
98                     return c.symbol2label[label]
99                 else:
100                     c.labels.append((c.symbol2number[label], None))
101                     c.symbol2label[label] = ilabel
102                     return ilabel
103             else:
104                 # A named token (NAME, NUMBER, STRING)
105                 itoken = getattr(token, label, None)
106                 assert isinstance(itoken, int), label
107                 assert itoken in token.tok_name, label
108                 if itoken in c.tokens:
109                     return c.tokens[itoken]
110                 else:
111                     c.labels.append((itoken, None))
112                     c.tokens[itoken] = ilabel
113                     return ilabel
114         else:
115             # Either a keyword or an operator
116             assert label[0] in ('"', "'"), label
117             value = eval(label)
118             if value[0].isalpha():
119                 # A keyword
120                 if value in c.keywords:
121                     return c.keywords[value]
122                 else:
123                     c.labels.append((token.NAME, value))
124                     c.keywords[value] = ilabel
125                     return ilabel
126             else:
127                 # An operator (any non-numeric token)
128                 itoken = grammar.opmap[value]  # Fails if unknown token
129                 if itoken in c.tokens:
130                     return c.tokens[itoken]
131                 else:
132                     c.labels.append((itoken, None))
133                     c.tokens[itoken] = ilabel
134                     return ilabel
135
136     def addfirstsets(self) -> None:
137         names = list(self.dfas.keys())
138         names.sort()
139         for name in names:
140             if name not in self.first:
141                 self.calcfirst(name)
142             # print name, self.first[name].keys()
143
144     def calcfirst(self, name: Text) -> None:
145         dfa = self.dfas[name]
146         self.first[name] = None  # dummy to detect left recursion
147         state = dfa[0]
148         totalset: Dict[str, int] = {}
149         overlapcheck = {}
150         for label, next in state.arcs.items():
151             if label in self.dfas:
152                 if label in self.first:
153                     fset = self.first[label]
154                     if fset is None:
155                         raise ValueError("recursion for rule %r" % name)
156                 else:
157                     self.calcfirst(label)
158                     fset = self.first[label]
159                     assert fset is not None
160                 totalset.update(fset)
161                 overlapcheck[label] = fset
162             else:
163                 totalset[label] = 1
164                 overlapcheck[label] = {label: 1}
165         inverse: Dict[str, str] = {}
166         for label, itsfirst in overlapcheck.items():
167             for symbol in itsfirst:
168                 if symbol in inverse:
169                     raise ValueError(
170                         "rule %s is ambiguous; %s is in the first sets of %s as well"
171                         " as %s"
172                         % (name, symbol, label, inverse[symbol])
173                     )
174                 inverse[symbol] = label
175         self.first[name] = totalset
176
177     def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
178         dfas = {}
179         startsymbol: Optional[str] = None
180         # MSTART: (NEWLINE | RULE)* ENDMARKER
181         while self.type != token.ENDMARKER:
182             while self.type == token.NEWLINE:
183                 self.gettoken()
184             # RULE: NAME ':' RHS NEWLINE
185             name = self.expect(token.NAME)
186             self.expect(token.OP, ":")
187             a, z = self.parse_rhs()
188             self.expect(token.NEWLINE)
189             # self.dump_nfa(name, a, z)
190             dfa = self.make_dfa(a, z)
191             # self.dump_dfa(name, dfa)
192             oldlen = len(dfa)
193             self.simplify_dfa(dfa)
194             newlen = len(dfa)
195             dfas[name] = dfa
196             # print name, oldlen, newlen
197             if startsymbol is None:
198                 startsymbol = name
199         assert startsymbol is not None
200         return dfas, startsymbol
201
202     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
203         # To turn an NFA into a DFA, we define the states of the DFA
204         # to correspond to *sets* of states of the NFA.  Then do some
205         # state reduction.  Let's represent sets as dicts with 1 for
206         # values.
207         assert isinstance(start, NFAState)
208         assert isinstance(finish, NFAState)
209
210         def closure(state: NFAState) -> Dict[NFAState, int]:
211             base: Dict[NFAState, int] = {}
212             addclosure(state, base)
213             return base
214
215         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
216             assert isinstance(state, NFAState)
217             if state in base:
218                 return
219             base[state] = 1
220             for label, next in state.arcs:
221                 if label is None:
222                     addclosure(next, base)
223
224         states = [DFAState(closure(start), finish)]
225         for state in states:  # NB states grows while we're iterating
226             arcs: Dict[str, Dict[NFAState, int]] = {}
227             for nfastate in state.nfaset:
228                 for label, next in nfastate.arcs:
229                     if label is not None:
230                         addclosure(next, arcs.setdefault(label, {}))
231             for label, nfaset in sorted(arcs.items()):
232                 for st in states:
233                     if st.nfaset == nfaset:
234                         break
235                 else:
236                     st = DFAState(nfaset, finish)
237                     states.append(st)
238                 state.addarc(st, label)
239         return states  # List of DFAState instances; first one is start
240
241     def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
242         print("Dump of NFA for", name)
243         todo = [start]
244         for i, state in enumerate(todo):
245             print("  State", i, state is finish and "(final)" or "")
246             for label, next in state.arcs:
247                 if next in todo:
248                     j = todo.index(next)
249                 else:
250                     j = len(todo)
251                     todo.append(next)
252                 if label is None:
253                     print("    -> %d" % j)
254                 else:
255                     print("    %s -> %d" % (label, j))
256
257     def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
258         print("Dump of DFA for", name)
259         for i, state in enumerate(dfa):
260             print("  State", i, state.isfinal and "(final)" or "")
261             for label, next in sorted(state.arcs.items()):
262                 print("    %s -> %d" % (label, dfa.index(next)))
263
264     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
265         # This is not theoretically optimal, but works well enough.
266         # Algorithm: repeatedly look for two states that have the same
267         # set of arcs (same labels pointing to the same nodes) and
268         # unify them, until things stop changing.
269
270         # dfa is a list of DFAState instances
271         changes = True
272         while changes:
273             changes = False
274             for i, state_i in enumerate(dfa):
275                 for j in range(i + 1, len(dfa)):
276                     state_j = dfa[j]
277                     if state_i == state_j:
278                         # print "  unify", i, j
279                         del dfa[j]
280                         for state in dfa:
281                             state.unifystate(state_j, state_i)
282                         changes = True
283                         break
284
285     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
286         # RHS: ALT ('|' ALT)*
287         a, z = self.parse_alt()
288         if self.value != "|":
289             return a, z
290         else:
291             aa = NFAState()
292             zz = NFAState()
293             aa.addarc(a)
294             z.addarc(zz)
295             while self.value == "|":
296                 self.gettoken()
297                 a, z = self.parse_alt()
298                 aa.addarc(a)
299                 z.addarc(zz)
300             return aa, zz
301
302     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
303         # ALT: ITEM+
304         a, b = self.parse_item()
305         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
306             c, d = self.parse_item()
307             b.addarc(c)
308             b = d
309         return a, b
310
311     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
312         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
313         if self.value == "[":
314             self.gettoken()
315             a, z = self.parse_rhs()
316             self.expect(token.OP, "]")
317             a.addarc(z)
318             return a, z
319         else:
320             a, z = self.parse_atom()
321             value = self.value
322             if value not in ("+", "*"):
323                 return a, z
324             self.gettoken()
325             z.addarc(a)
326             if value == "+":
327                 return a, z
328             else:
329                 return a, a
330
331     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
332         # ATOM: '(' RHS ')' | NAME | STRING
333         if self.value == "(":
334             self.gettoken()
335             a, z = self.parse_rhs()
336             self.expect(token.OP, ")")
337             return a, z
338         elif self.type in (token.NAME, token.STRING):
339             a = NFAState()
340             z = NFAState()
341             a.addarc(z, self.value)
342             self.gettoken()
343             return a, z
344         else:
345             self.raise_error(
346                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
347             )
348             assert False
349
350     def expect(self, type: int, value: Optional[Any] = None) -> Text:
351         if self.type != type or (value is not None and self.value != value):
352             self.raise_error(
353                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
354             )
355         value = self.value
356         self.gettoken()
357         return value
358
359     def gettoken(self) -> None:
360         tup = next(self.generator)
361         while tup[0] in (tokenize.COMMENT, tokenize.NL):
362             tup = next(self.generator)
363         self.type, self.value, self.begin, self.end, self.line = tup
364         # print token.tok_name[self.type], repr(self.value)
365
366     def raise_error(self, msg: str, *args: Any) -> NoReturn:
367         if args:
368             try:
369                 msg = msg % args
370             except:
371                 msg = " ".join([msg] + list(map(str, args)))
372         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
373
374
375 class NFAState(object):
376     arcs: List[Tuple[Optional[Text], "NFAState"]]
377
378     def __init__(self) -> None:
379         self.arcs = []  # list of (label, NFAState) pairs
380
381     def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
382         assert label is None or isinstance(label, str)
383         assert isinstance(next, NFAState)
384         self.arcs.append((label, next))
385
386
387 class DFAState(object):
388     nfaset: Dict[NFAState, Any]
389     isfinal: bool
390     arcs: Dict[Text, "DFAState"]
391
392     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
393         assert isinstance(nfaset, dict)
394         assert isinstance(next(iter(nfaset)), NFAState)
395         assert isinstance(final, NFAState)
396         self.nfaset = nfaset
397         self.isfinal = final in nfaset
398         self.arcs = {}  # map from label to DFAState
399
400     def addarc(self, next: "DFAState", label: Text) -> None:
401         assert isinstance(label, str)
402         assert label not in self.arcs
403         assert isinstance(next, DFAState)
404         self.arcs[label] = next
405
406     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
407         for label, next in self.arcs.items():
408             if next is old:
409                 self.arcs[label] = new
410
411     def __eq__(self, other: Any) -> bool:
412         # Equality test -- ignore the nfaset instance variable
413         assert isinstance(other, DFAState)
414         if self.isfinal != other.isfinal:
415             return False
416         # Can't just return self.arcs == other.arcs, because that
417         # would invoke this method recursively, with cycles...
418         if len(self.arcs) != len(other.arcs):
419             return False
420         for label, next in self.arcs.items():
421             if next is not other.arcs.get(label):
422                 return False
423         return True
424
425     __hash__: Any = None  # For Py3 compatibility.
426
427
428 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
429     p = ParserGenerator(filename)
430     return p.make_grammar()