# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
from . import grammar, token, tokenize


class PgenGrammar(grammar.Grammar):
    pass


class ParserGenerator(object):
    def __init__(self, filename, stream=None):
        close_stream = None
        if stream is None:
            stream = open(filename)
            close_stream = stream.close
        self.filename = filename
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
        c = PgenGrammar()
        names = list(self.dfas.keys())
        names.sort()
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c

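    # Note on numbering: non-terminals start at 256, so they can never
    # collide with token numbers, which are small integers below 256.
    # The (0, state) self-arcs appended to final states above act as the
    # "accept" marker when the generated tables are driven by the parser.
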
    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = {}
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

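    # Illustrative examples of the four label kinds handled above (the
    # grammar symbols shown are hypothetical, not from any real grammar):
    #   make_label(c, "expr")  -> non-terminal: looked up in c.symbol2number
    #   make_label(c, "NAME")  -> named token: resolved via getattr(token, ...)
    #   make_label(c, "'if'")  -> quoted keyword: stored as (token.NAME, "if")
    #   make_label(c, "'+'")   -> quoted operator: resolved via grammar.opmap
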
    def addfirstsets(self):
        names = list(self.dfas.keys())
        names.sort()
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
            # print name, self.first[name].keys()

    def calcfirst(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset = {}
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset

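    # Worked example (hypothetical rules, not from any real grammar): given
    #   atom: NAME | '(' expr ')'
    # the start state of atom's DFA has arcs labeled NAME and '(', so
    # calcfirst yields first["atom"] == {"NAME": 1, "'('": 1}.  A rule like
    #   ambig: NAME | atom
    # would raise ValueError, because NAME appears in the first sets of
    # both alternatives.
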
    def parse(self):
        dfas = {}
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            # self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            # self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            # print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

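    # The input format parsed above is one rule per logical line.  A grammar
    # file could, for instance (hypothetical contents), read:
    #   file_input: (NEWLINE | stmt)* ENDMARKER
    #   stmt: NAME '=' NAME NEWLINE
    # The first rule encountered becomes the start symbol.
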
    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state):
            base = {}
            addclosure(state, base)
            return base

        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

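    # Sketch of the subset construction on a tiny, hypothetical NFA for
    # the sequence "a b" (eps = epsilon arc):
    #   NFA:  0 --a--> 1 --eps--> 2 --b--> 3 (finish)
    # closure({0}) = {0}; on 'a' we reach closure({1}) = {1, 2}; on 'b' we
    # reach closure({3}) = {3}.  The resulting DFA has three states, and
    # the last one is final because it contains the NFA finish state.
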
    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

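    # Shape built above for a two-branch RHS "x | y" (unlabeled arcs are
    # epsilon arcs): fresh states aa and zz bracket every alternative:
    #   aa --> [x ...] --> zz
    #   aa --> [y ...] --> zz
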
    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

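    # How the repetition operators differ (x is any atom; a and z are its
    # entry and exit states, unlabeled arcs are epsilon arcs):
    #   x+   adds a back arc z --> a and keeps (a, z): at least one pass.
    #   x*   adds the same back arc but returns (a, a), so the item can
    #        also be skipped entirely.
    #   [x]  adds a forward arc a --> z, making x optional.
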
    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error(
                "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
            )

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error(
                "expected %s/%s, got %s/%s", type, value, self.type, self.value
            )
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))


class NFAState(object):
    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState(object):
    def __init__(self, nfaset, final):
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # For Py3 compatibility.


def generate_grammar(filename="Grammar.txt"):
    p = ParserGenerator(filename)
    return p.make_grammar()
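

# Minimal usage sketch.  The grammar below is a made-up single-rule example,
# not part of any real Grammar.txt, and this assumes tokenize.generate_tokens
# accepts any object with a .readline method (such as io.StringIO).
if __name__ == "__main__":
    import io

    demo = "start: NAME ('+' NAME)* NEWLINE\n"  # hypothetical grammar text
    pg = ParserGenerator("<demo>", io.StringIO(demo))
    g = pg.make_grammar()
    print("start symbol:", g.start)            # 256, the first non-terminal
    print("symbols:", sorted(g.symbol2number))  # ['start']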