]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Improve performance by skipping unnecessary normalisation (#3751)
[etc/vim.git] / src / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 from typing import (
8     Any,
9     Dict,
10     IO,
11     Iterator,
12     List,
13     Optional,
14     Tuple,
15     Union,
16     Sequence,
17     NoReturn,
18 )
19 from blib2to3.pgen2 import grammar
20 from blib2to3.pgen2.tokenize import GoodTokenInfo
21 import os
22
23
24 Path = Union[str, "os.PathLike[str]"]
25
26
27 class PgenGrammar(grammar.Grammar):
28     pass
29
30
31 class ParserGenerator:
32     filename: Path
33     stream: IO[str]
34     generator: Iterator[GoodTokenInfo]
35     first: Dict[str, Optional[Dict[str, int]]]
36
37     def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
38         close_stream = None
39         if stream is None:
40             stream = open(filename, encoding="utf-8")
41             close_stream = stream.close
42         self.filename = filename
43         self.stream = stream
44         self.generator = tokenize.generate_tokens(stream.readline)
45         self.gettoken()  # Initialize lookahead
46         self.dfas, self.startsymbol = self.parse()
47         if close_stream is not None:
48             close_stream()
49         self.first = {}  # map from symbol name to set of tokens
50         self.addfirstsets()
51
52     def make_grammar(self) -> PgenGrammar:
53         c = PgenGrammar()
54         names = list(self.dfas.keys())
55         names.sort()
56         names.remove(self.startsymbol)
57         names.insert(0, self.startsymbol)
58         for name in names:
59             i = 256 + len(c.symbol2number)
60             c.symbol2number[name] = i
61             c.number2symbol[i] = name
62         for name in names:
63             dfa = self.dfas[name]
64             states = []
65             for state in dfa:
66                 arcs = []
67                 for label, next in sorted(state.arcs.items()):
68                     arcs.append((self.make_label(c, label), dfa.index(next)))
69                 if state.isfinal:
70                     arcs.append((0, dfa.index(state)))
71                 states.append(arcs)
72             c.states.append(states)
73             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
74         c.start = c.symbol2number[self.startsymbol]
75         return c
76
77     def make_first(self, c: PgenGrammar, name: str) -> Dict[int, int]:
78         rawfirst = self.first[name]
79         assert rawfirst is not None
80         first = {}
81         for label in sorted(rawfirst):
82             ilabel = self.make_label(c, label)
83             ##assert ilabel not in first # XXX failed on <> ... !=
84             first[ilabel] = 1
85         return first
86
87     def make_label(self, c: PgenGrammar, label: str) -> int:
88         # XXX Maybe this should be a method on a subclass of converter?
89         ilabel = len(c.labels)
90         if label[0].isalpha():
91             # Either a symbol name or a named token
92             if label in c.symbol2number:
93                 # A symbol name (a non-terminal)
94                 if label in c.symbol2label:
95                     return c.symbol2label[label]
96                 else:
97                     c.labels.append((c.symbol2number[label], None))
98                     c.symbol2label[label] = ilabel
99                     return ilabel
100             else:
101                 # A named token (NAME, NUMBER, STRING)
102                 itoken = getattr(token, label, None)
103                 assert isinstance(itoken, int), label
104                 assert itoken in token.tok_name, label
105                 if itoken in c.tokens:
106                     return c.tokens[itoken]
107                 else:
108                     c.labels.append((itoken, None))
109                     c.tokens[itoken] = ilabel
110                     return ilabel
111         else:
112             # Either a keyword or an operator
113             assert label[0] in ('"', "'"), label
114             value = eval(label)
115             if value[0].isalpha():
116                 if label[0] == '"':
117                     keywords = c.soft_keywords
118                 else:
119                     keywords = c.keywords
120
121                 # A keyword
122                 if value in keywords:
123                     return keywords[value]
124                 else:
125                     c.labels.append((token.NAME, value))
126                     keywords[value] = ilabel
127                     return ilabel
128             else:
129                 # An operator (any non-numeric token)
130                 itoken = grammar.opmap[value]  # Fails if unknown token
131                 if itoken in c.tokens:
132                     return c.tokens[itoken]
133                 else:
134                     c.labels.append((itoken, None))
135                     c.tokens[itoken] = ilabel
136                     return ilabel
137
138     def addfirstsets(self) -> None:
139         names = list(self.dfas.keys())
140         names.sort()
141         for name in names:
142             if name not in self.first:
143                 self.calcfirst(name)
144             # print name, self.first[name].keys()
145
146     def calcfirst(self, name: str) -> None:
147         dfa = self.dfas[name]
148         self.first[name] = None  # dummy to detect left recursion
149         state = dfa[0]
150         totalset: Dict[str, int] = {}
151         overlapcheck = {}
152         for label, next in state.arcs.items():
153             if label in self.dfas:
154                 if label in self.first:
155                     fset = self.first[label]
156                     if fset is None:
157                         raise ValueError("recursion for rule %r" % name)
158                 else:
159                     self.calcfirst(label)
160                     fset = self.first[label]
161                     assert fset is not None
162                 totalset.update(fset)
163                 overlapcheck[label] = fset
164             else:
165                 totalset[label] = 1
166                 overlapcheck[label] = {label: 1}
167         inverse: Dict[str, str] = {}
168         for label, itsfirst in overlapcheck.items():
169             for symbol in itsfirst:
170                 if symbol in inverse:
171                     raise ValueError(
172                         "rule %s is ambiguous; %s is in the first sets of %s as well"
173                         " as %s" % (name, symbol, label, inverse[symbol])
174                     )
175                 inverse[symbol] = label
176         self.first[name] = totalset
177
178     def parse(self) -> Tuple[Dict[str, List["DFAState"]], str]:
179         dfas = {}
180         startsymbol: Optional[str] = None
181         # MSTART: (NEWLINE | RULE)* ENDMARKER
182         while self.type != token.ENDMARKER:
183             while self.type == token.NEWLINE:
184                 self.gettoken()
185             # RULE: NAME ':' RHS NEWLINE
186             name = self.expect(token.NAME)
187             self.expect(token.OP, ":")
188             a, z = self.parse_rhs()
189             self.expect(token.NEWLINE)
190             # self.dump_nfa(name, a, z)
191             dfa = self.make_dfa(a, z)
192             # self.dump_dfa(name, dfa)
193             oldlen = len(dfa)
194             self.simplify_dfa(dfa)
195             newlen = len(dfa)
196             dfas[name] = dfa
197             # print name, oldlen, newlen
198             if startsymbol is None:
199                 startsymbol = name
200         assert startsymbol is not None
201         return dfas, startsymbol
202
203     def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
204         # To turn an NFA into a DFA, we define the states of the DFA
205         # to correspond to *sets* of states of the NFA.  Then do some
206         # state reduction.  Let's represent sets as dicts with 1 for
207         # values.
208         assert isinstance(start, NFAState)
209         assert isinstance(finish, NFAState)
210
211         def closure(state: NFAState) -> Dict[NFAState, int]:
212             base: Dict[NFAState, int] = {}
213             addclosure(state, base)
214             return base
215
216         def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
217             assert isinstance(state, NFAState)
218             if state in base:
219                 return
220             base[state] = 1
221             for label, next in state.arcs:
222                 if label is None:
223                     addclosure(next, base)
224
225         states = [DFAState(closure(start), finish)]
226         for state in states:  # NB states grows while we're iterating
227             arcs: Dict[str, Dict[NFAState, int]] = {}
228             for nfastate in state.nfaset:
229                 for label, next in nfastate.arcs:
230                     if label is not None:
231                         addclosure(next, arcs.setdefault(label, {}))
232             for label, nfaset in sorted(arcs.items()):
233                 for st in states:
234                     if st.nfaset == nfaset:
235                         break
236                 else:
237                     st = DFAState(nfaset, finish)
238                     states.append(st)
239                 state.addarc(st, label)
240         return states  # List of DFAState instances; first one is start
241
242     def dump_nfa(self, name: str, start: "NFAState", finish: "NFAState") -> None:
243         print("Dump of NFA for", name)
244         todo = [start]
245         for i, state in enumerate(todo):
246             print("  State", i, state is finish and "(final)" or "")
247             for label, next in state.arcs:
248                 if next in todo:
249                     j = todo.index(next)
250                 else:
251                     j = len(todo)
252                     todo.append(next)
253                 if label is None:
254                     print("    -> %d" % j)
255                 else:
256                     print("    %s -> %d" % (label, j))
257
258     def dump_dfa(self, name: str, dfa: Sequence["DFAState"]) -> None:
259         print("Dump of DFA for", name)
260         for i, state in enumerate(dfa):
261             print("  State", i, state.isfinal and "(final)" or "")
262             for label, next in sorted(state.arcs.items()):
263                 print("    %s -> %d" % (label, dfa.index(next)))
264
265     def simplify_dfa(self, dfa: List["DFAState"]) -> None:
266         # This is not theoretically optimal, but works well enough.
267         # Algorithm: repeatedly look for two states that have the same
268         # set of arcs (same labels pointing to the same nodes) and
269         # unify them, until things stop changing.
270
271         # dfa is a list of DFAState instances
272         changes = True
273         while changes:
274             changes = False
275             for i, state_i in enumerate(dfa):
276                 for j in range(i + 1, len(dfa)):
277                     state_j = dfa[j]
278                     if state_i == state_j:
279                         # print "  unify", i, j
280                         del dfa[j]
281                         for state in dfa:
282                             state.unifystate(state_j, state_i)
283                         changes = True
284                         break
285
286     def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
287         # RHS: ALT ('|' ALT)*
288         a, z = self.parse_alt()
289         if self.value != "|":
290             return a, z
291         else:
292             aa = NFAState()
293             zz = NFAState()
294             aa.addarc(a)
295             z.addarc(zz)
296             while self.value == "|":
297                 self.gettoken()
298                 a, z = self.parse_alt()
299                 aa.addarc(a)
300                 z.addarc(zz)
301             return aa, zz
302
303     def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
304         # ALT: ITEM+
305         a, b = self.parse_item()
306         while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
307             c, d = self.parse_item()
308             b.addarc(c)
309             b = d
310         return a, b
311
312     def parse_item(self) -> Tuple["NFAState", "NFAState"]:
313         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
314         if self.value == "[":
315             self.gettoken()
316             a, z = self.parse_rhs()
317             self.expect(token.OP, "]")
318             a.addarc(z)
319             return a, z
320         else:
321             a, z = self.parse_atom()
322             value = self.value
323             if value not in ("+", "*"):
324                 return a, z
325             self.gettoken()
326             z.addarc(a)
327             if value == "+":
328                 return a, z
329             else:
330                 return a, a
331
332     def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
333         # ATOM: '(' RHS ')' | NAME | STRING
334         if self.value == "(":
335             self.gettoken()
336             a, z = self.parse_rhs()
337             self.expect(token.OP, ")")
338             return a, z
339         elif self.type in (token.NAME, token.STRING):
340             a = NFAState()
341             z = NFAState()
342             a.addarc(z, self.value)
343             self.gettoken()
344             return a, z
345         else:
346             self.raise_error(
347                 "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
348             )
349             assert False
350
351     def expect(self, type: int, value: Optional[Any] = None) -> str:
352         if self.type != type or (value is not None and self.value != value):
353             self.raise_error(
354                 "expected %s/%s, got %s/%s", type, value, self.type, self.value
355             )
356         value = self.value
357         self.gettoken()
358         return value
359
360     def gettoken(self) -> None:
361         tup = next(self.generator)
362         while tup[0] in (tokenize.COMMENT, tokenize.NL):
363             tup = next(self.generator)
364         self.type, self.value, self.begin, self.end, self.line = tup
365         # print token.tok_name[self.type], repr(self.value)
366
367     def raise_error(self, msg: str, *args: Any) -> NoReturn:
368         if args:
369             try:
370                 msg = msg % args
371             except:
372                 msg = " ".join([msg] + list(map(str, args)))
373         raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
374
375
376 class NFAState:
377     arcs: List[Tuple[Optional[str], "NFAState"]]
378
379     def __init__(self) -> None:
380         self.arcs = []  # list of (label, NFAState) pairs
381
382     def addarc(self, next: "NFAState", label: Optional[str] = None) -> None:
383         assert label is None or isinstance(label, str)
384         assert isinstance(next, NFAState)
385         self.arcs.append((label, next))
386
387
388 class DFAState:
389     nfaset: Dict[NFAState, Any]
390     isfinal: bool
391     arcs: Dict[str, "DFAState"]
392
393     def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
394         assert isinstance(nfaset, dict)
395         assert isinstance(next(iter(nfaset)), NFAState)
396         assert isinstance(final, NFAState)
397         self.nfaset = nfaset
398         self.isfinal = final in nfaset
399         self.arcs = {}  # map from label to DFAState
400
401     def addarc(self, next: "DFAState", label: str) -> None:
402         assert isinstance(label, str)
403         assert label not in self.arcs
404         assert isinstance(next, DFAState)
405         self.arcs[label] = next
406
407     def unifystate(self, old: "DFAState", new: "DFAState") -> None:
408         for label, next in self.arcs.items():
409             if next is old:
410                 self.arcs[label] = new
411
412     def __eq__(self, other: Any) -> bool:
413         # Equality test -- ignore the nfaset instance variable
414         assert isinstance(other, DFAState)
415         if self.isfinal != other.isfinal:
416             return False
417         # Can't just return self.arcs == other.arcs, because that
418         # would invoke this method recursively, with cycles...
419         if len(self.arcs) != len(other.arcs):
420             return False
421         for label, next in self.arcs.items():
422             if next is not other.arcs.get(label):
423                 return False
424         return True
425
426     __hash__: Any = None  # For Py3 compatibility.
427
428
429 def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
430     p = ParserGenerator(filename)
431     return p.make_grammar()