]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Avoid EncodingWarning in blib2to3 (#3696)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 from typing import (
8     Any,
9     Dict,
10     IO,
11     Iterator,
12     List,
13     Optional,
14     Text,
15     Tuple,
16     Union,
17     Sequence,
18     NoReturn,
19 )
20 from blib2to3.pgen2 import grammar
21 from blib2to3.pgen2.tokenize import GoodTokenInfo
22 import os
23
24
25 Path = Union[str, "os.PathLike[str]"]
26
27
28 class PgenGrammar(grammar.Grammar):
29     pass
30
31
32 class ParserGenerator(object):
33     filename: Path
34     stream: IO[Text]
35     generator: Iterator[GoodTokenInfo]
36     first: Dict[Text, Optional[Dict[Text, int]]]
37
38     def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
39         close_stream = None
40         if stream is None:
41             stream = open(filename, encoding="utf-8")
42             close_stream = stream.close
43         self.filename = filename
44         self.stream = stream
45         self.generator = tokenize.generate_tokens(stream.readline)
46         self.gettoken()  # Initialize lookahead
47         self.dfas, self.startsymbol = self.parse()
48         if close_stream is not None:
49             close_stream()
50         self.first = {}  # map from symbol name to set of tokens
51         self.addfirstsets()
52
53     def make_grammar(self) -> PgenGrammar:
54         c = PgenGrammar()
55         names = list(self.dfas.keys())
56         names.sort()
57         names.remove(self.startsymbol)
58         names.insert(0, self.startsymbol)
59         for name in names:
60             i = 256 + len(c.symbol2number)
61             c.symbol2number[name] = i
62             c.number2symbol[i] = name
63         for name in names:
64             dfa = self.dfas[name]
65             states = []
66             for state in dfa:
67                 arcs = []
68                 for label, next in sorted(state.arcs.items()):
69                     arcs.append((self.make_label(c, label), dfa.index(next)))
70                 if state.isfinal:
71                     arcs.append((0, dfa.index(state)))
72                 states.append(arcs)
73             c.states.append(states)
74             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
75         c.start = c.symbol2number[self.startsymbol]
76         return c
77
78     def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
79         rawfirst = self.first[name]
80         assert rawfirst is not None
81         first = {}
82         for label in sorted(rawfirst):
83             ilabel = self.make_label(c, label)
84             ##assert ilabel not in first # XXX failed on <> ... !=
85             first[ilabel] = 1
86         return first
87
88     def make_label(self, c: PgenGrammar, label: Text) -> int:
89         # XXX Maybe this should be a method on a subclass of converter?
90         ilabel = len(c.labels)
91         if label[0].isalpha():
92             # Either a symbol name or a named token
93             if label in c.symbol2number:
94                 # A symbol name (a non-terminal)
95                 if label in c.symbol2label:
96                     return c.symbol2label[label]
97                 else:
98                     c.labels.append((c.symbol2number[label], None))
99                     c.symbol2label[label] = ilabel
100                     return ilabel
101             else:
102                 # A named token (NAME, NUMBER, STRING)
103                 itoken = getattr(token, label, None)
104                 assert isinstance(itoken, int), label
105                 assert itoken in token.tok_name, label
106                 if itoken in c.tokens:
107                     return c.tokens[itoken]
108                 else:
109                     c.labels.append((itoken, None))
110                     c.tokens[itoken] = ilabel
111                     return ilabel
112         else:
113             # Either a keyword or an operator
114             assert label[0] in ('"', "'"), label
115             value = eval(label)
116             if value[0].isalpha():
117                 if label[0] == '"':
118                     keywords = c.soft_keywords
119                 else:
120                     keywords = c.keywords
121
122                 # A keyword
123                 if value in keywords:
124                     return keywords[value]
125                 else:
126                     c.labels.append((token.NAME, value))
127                     keywords[value] = ilabel
128                     return ilabel
129             else:
130                 # An operator (any non-numeric token)
131                 itoken = grammar.opmap[value]  # Fails if unknown token
132                 if itoken in c.tokens:
133                     return c.tokens[itoken]
134                 else:
135                     c.labels.append((itoken, None))
136                     c.tokens[itoken] = ilabel
137                     return ilabel
138
139     def addfirstsets(self) -> None:
140         names = list(self.dfas.keys())
141         names.sort()
142         for name in names:
143             if name not in self.first:
144                 self.calcfirst(name)
145             # print name, self.first[name].keys()
146
147     def calcfirst(self, name: Text) -> None:
148         dfa = self.dfas[name]
149         self.first[name] = None  # dummy to detect left recursion
150         state = dfa[0]
151         totalset: Dict[str, int] = {}
152         overlapcheck = {}
153         for label, next in state.arcs.items():
154             if label in self.dfas:
155                 if label in self.first:
156                     fset = self.first[label]
157                     if fset is None:
158                         raise ValueError("recursion for rule %r" % name)
159                 else:
160                     self.calcfirst(label)
161                     fset = self.first[label]
162                     assert fset is not None
163                 totalset.update(fset)
164                 overlapcheck[label] = fset
165             else:
166                 totalset[label] = 1
167                 overlapcheck[label] = {label: 1}
168         inverse: Dict[str, str] = {}
169         for label, itsfirst in overlapcheck.items():
170             for symbol in itsfirst:
171                 if symbol in inverse:
172                     raise ValueError(
173                         "rule %s is ambiguous; %s is in the first sets of %s as well"
174                         " as %s" % (name, symbol, label, inverse[symbol])
175                     )
176                 inverse[symbol] = label
177         self.first[name] = totalset
178
179     def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
180         dfas = {}
181         startsymbol: Optional[str] = None
182         # MSTART: (NEWLINE | RULE)* ENDMARKER
183         while self.type != token.ENDMARKER:
184             while self.type == token.NEWLINE:
185                 self.gettoken()
186             # RULE: NAME ':' RHS NEWLINE
187             name = self.expect(token.NAME)
188             self.expect(token.OP, ":")
189             a, z = self.parse_rhs()
190             self.expect(token.NEWLINE)
191             # self.dump_nfa(name, a, z)
192             dfa = self.make_dfa(a, z)
193             # self.dump_dfa(name, dfa)
194             oldlen = len(dfa)
195             self.simplify_dfa(dfa)
196             newlen = len(dfa)
197             dfas[name] = dfa
198             # print name, oldlen, newlen
199             if startsymbol is None:
200                 startsymbol = name
201         assert startsymbol is not None
202         return dfas, startsymbol
203
204     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
205         # To turn an NFA into a DFA, we define the states of the DFA
206         # to correspond to *sets* of states of the NFA.  Then do some
207         # state reduction.  Let's represent sets as dicts with 1 for
208         # values.
209         assert isinstance(start, NFAState)
210         assert isinstance(finish, NFAState)
211
212         def closure(state: NFAState) -> Dict[NFAState, int]:
213             base: Dict[NFAState, int] = {}
214             addclosure(state, base)
215             return base
216
217         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
218             assert isinstance(state, NFAState)
219             if state in base:
220                 return
221             base[state] = 1
222             for label, next in state.arcs:
223                 if label is None:
224                     addclosure(next, base)
225
226         states = [DFAState(closure(start), finish)]
227         for state in states:  # NB states grows while we're iterating
228             arcs: Dict[str, Dict[NFAState, int]] = {}
229             for nfastate in state.nfaset:
230                 for label, next in nfastate.arcs:
231                     if label is not None:
232                         addclosure(next, arcs.setdefault(label, {}))
233             for label, nfaset in sorted(arcs.items()):
234                 for st in states:
235                     if st.nfaset == nfaset:
236                         break
237                 else:
238                     st = DFAState(nfaset, finish)
239                     states.append(st)
240                 state.addarc(st, label)
241         return states  # List of DFAState instances; first one is start
242
243     def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
244         print("Dump of NFA for", name)
245         todo = [start]
246         for i, state in enumerate(todo):
247             print("  State", i, state is finish and "(final)" or "")
248             for label, next in state.arcs:
249                 if next in todo:
250                     j = todo.index(next)
251                 else:
252                     j = len(todo)
253                     todo.append(next)
254                 if label is None:
255                     print("    -> %d" % j)
256                 else:
257                     print("    %s -> %d" % (label, j))
258
259     def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
260         print("Dump of DFA for", name)
261         for i, state in enumerate(dfa):
262             print("  State", i, state.isfinal and "(final)" or "")
263             for label, next in sorted(state.arcs.items()):
264                 print("    %s -> %d" % (label, dfa.index(next)))
265
266     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
267         # This is not theoretically optimal, but works well enough.
268         # Algorithm: repeatedly look for two states that have the same
269         # set of arcs (same labels pointing to the same nodes) and
270         # unify them, until things stop changing.
271
272         # dfa is a list of DFAState instances
273         changes = True
274         while changes:
275             changes = False
276             for i, state_i in enumerate(dfa):
277                 for j in range(i + 1, len(dfa)):
278                     state_j = dfa[j]
279                     if state_i == state_j:
280                         # print "  unify", i, j
281                         del dfa[j]
282                         for state in dfa:
283                             state.unifystate(state_j, state_i)
284                         changes = True
285                         break
286
287     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
288         # RHS: ALT ('|' ALT)*
289         a, z = self.parse_alt()
290         if self.value != "|":
291             return a, z
292         else:
293             aa = NFAState()
294             zz = NFAState()
295             aa.addarc(a)
296             z.addarc(zz)
297             while self.value == "|":
298                 self.gettoken()
299                 a, z = self.parse_alt()
300                 aa.addarc(a)
301                 z.addarc(zz)
302             return aa, zz
303
304     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
305         # ALT: ITEM+
306         a, b = self.parse_item()
307         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
308             c, d = self.parse_item()
309             b.addarc(c)
310             b = d
311         return a, b
312
313     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
314         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
315         if self.value == "[":
316             self.gettoken()
317             a, z = self.parse_rhs()
318             self.expect(token.OP, "]")
319             a.addarc(z)
320             return a, z
321         else:
322             a, z = self.parse_atom()
323             value = self.value
324             if value not in ("+", "*"):
325                 return a, z
326             self.gettoken()
327             z.addarc(a)
328             if value == "+":
329                 return a, z
330             else:
331                 return a, a
332
333     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
334         # ATOM: '(' RHS ')' | NAME | STRING
335         if self.value == "(":
336             self.gettoken()
337             a, z = self.parse_rhs()
338             self.expect(token.OP, ")")
339             return a, z
340         elif self.type in (token.NAME, token.STRING):
341             a = NFAState()
342             z = NFAState()
343             a.addarc(z, self.value)
344             self.gettoken()
345             return a, z
346         else:
347             self.raise_error(
348                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
349             )
350             assert False
351
352     def expect(self, type: int, value: Optional[Any] = None) -> Text:
353         if self.type != type or (value is not None and self.value != value):
354             self.raise_error(
355                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
356             )
357         value = self.value
358         self.gettoken()
359         return value
360
361     def gettoken(self) -> None:
362         tup = next(self.generator)
363         while tup[0] in (tokenize.COMMENT, tokenize.NL):
364             tup = next(self.generator)
365         self.type, self.value, self.begin, self.end, self.line = tup
366         # print token.tok_name[self.type], repr(self.value)
367
368     def raise_error(self, msg: str, *args: Any) -> NoReturn:
369         if args:
370             try:
371                 msg = msg % args
372             except:
373                 msg = " ".join([msg] + list(map(str, args)))
374         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
375
376
377 class NFAState(object):
378     arcs: List[Tuple[Optional[Text], "NFAState"]]
379
380     def __init__(self) -> None:
381         self.arcs = []  # list of (label, NFAState) pairs
382
383     def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
384         assert label is None or isinstance(label, str)
385         assert isinstance(next, NFAState)
386         self.arcs.append((label, next))
387
388
389 class DFAState(object):
390     nfaset: Dict[NFAState, Any]
391     isfinal: bool
392     arcs: Dict[Text, "DFAState"]
393
394     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
395         assert isinstance(nfaset, dict)
396         assert isinstance(next(iter(nfaset)), NFAState)
397         assert isinstance(final, NFAState)
398         self.nfaset = nfaset
399         self.isfinal = final in nfaset
400         self.arcs = {}  # map from label to DFAState
401
402     def addarc(self, next: "DFAState", label: Text) -> None:
403         assert isinstance(label, str)
404         assert label not in self.arcs
405         assert isinstance(next, DFAState)
406         self.arcs[label] = next
407
408     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
409         for label, next in self.arcs.items():
410             if next is old:
411                 self.arcs[label] = new
412
413     def __eq__(self, other: Any) -> bool:
414         # Equality test -- ignore the nfaset instance variable
415         assert isinstance(other, DFAState)
416         if self.isfinal != other.isfinal:
417             return False
418         # Can't just return self.arcs == other.arcs, because that
419         # would invoke this method recursively, with cycles...
420         if len(self.arcs) != len(other.arcs):
421             return False
422         for label, next in self.arcs.items():
423             if next is not other.arcs.get(label):
424                 return False
425         return True
426
427     __hash__: Any = None  # For Py3 compatibility.
428
429
430 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
431     p = ParserGenerator(filename)
432     return p.make_grammar()