]> git.madduck.net Git - etc/vim.git/blob - blib2to3/pgen2/pgen.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Open temporary files with utf-8 encoding (#126)
[etc/vim.git] / blib2to3 / pgen2 / pgen.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Pgen imports
5 from . import grammar, token, tokenize
6
7 class PgenGrammar(grammar.Grammar):
8     pass
9
10 class ParserGenerator(object):
11
12     def __init__(self, filename, stream=None):
13         close_stream = None
14         if stream is None:
15             stream = open(filename)
16             close_stream = stream.close
17         self.filename = filename
18         self.stream = stream
19         self.generator = tokenize.generate_tokens(stream.readline)
20         self.gettoken() # Initialize lookahead
21         self.dfas, self.startsymbol = self.parse()
22         if close_stream is not None:
23             close_stream()
24         self.first = {} # map from symbol name to set of tokens
25         self.addfirstsets()
26
27     def make_grammar(self):
28         c = PgenGrammar()
29         names = list(self.dfas.keys())
30         names.sort()
31         names.remove(self.startsymbol)
32         names.insert(0, self.startsymbol)
33         for name in names:
34             i = 256 + len(c.symbol2number)
35             c.symbol2number[name] = i
36             c.number2symbol[i] = name
37         for name in names:
38             dfa = self.dfas[name]
39             states = []
40             for state in dfa:
41                 arcs = []
42                 for label, next in sorted(state.arcs.items()):
43                     arcs.append((self.make_label(c, label), dfa.index(next)))
44                 if state.isfinal:
45                     arcs.append((0, dfa.index(state)))
46                 states.append(arcs)
47             c.states.append(states)
48             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
49         c.start = c.symbol2number[self.startsymbol]
50         return c
51
52     def make_first(self, c, name):
53         rawfirst = self.first[name]
54         first = {}
55         for label in sorted(rawfirst):
56             ilabel = self.make_label(c, label)
57             ##assert ilabel not in first # XXX failed on <> ... !=
58             first[ilabel] = 1
59         return first
60
61     def make_label(self, c, label):
62         # XXX Maybe this should be a method on a subclass of converter?
63         ilabel = len(c.labels)
64         if label[0].isalpha():
65             # Either a symbol name or a named token
66             if label in c.symbol2number:
67                 # A symbol name (a non-terminal)
68                 if label in c.symbol2label:
69                     return c.symbol2label[label]
70                 else:
71                     c.labels.append((c.symbol2number[label], None))
72                     c.symbol2label[label] = ilabel
73                     return ilabel
74             else:
75                 # A named token (NAME, NUMBER, STRING)
76                 itoken = getattr(token, label, None)
77                 assert isinstance(itoken, int), label
78                 assert itoken in token.tok_name, label
79                 if itoken in c.tokens:
80                     return c.tokens[itoken]
81                 else:
82                     c.labels.append((itoken, None))
83                     c.tokens[itoken] = ilabel
84                     return ilabel
85         else:
86             # Either a keyword or an operator
87             assert label[0] in ('"', "'"), label
88             value = eval(label)
89             if value[0].isalpha():
90                 # A keyword
91                 if value in c.keywords:
92                     return c.keywords[value]
93                 else:
94                     c.labels.append((token.NAME, value))
95                     c.keywords[value] = ilabel
96                     return ilabel
97             else:
98                 # An operator (any non-numeric token)
99                 itoken = grammar.opmap[value] # Fails if unknown token
100                 if itoken in c.tokens:
101                     return c.tokens[itoken]
102                 else:
103                     c.labels.append((itoken, None))
104                     c.tokens[itoken] = ilabel
105                     return ilabel
106
107     def addfirstsets(self):
108         names = list(self.dfas.keys())
109         names.sort()
110         for name in names:
111             if name not in self.first:
112                 self.calcfirst(name)
113             #print name, self.first[name].keys()
114
115     def calcfirst(self, name):
116         dfa = self.dfas[name]
117         self.first[name] = None # dummy to detect left recursion
118         state = dfa[0]
119         totalset = {}
120         overlapcheck = {}
121         for label, next in state.arcs.items():
122             if label in self.dfas:
123                 if label in self.first:
124                     fset = self.first[label]
125                     if fset is None:
126                         raise ValueError("recursion for rule %r" % name)
127                 else:
128                     self.calcfirst(label)
129                     fset = self.first[label]
130                 totalset.update(fset)
131                 overlapcheck[label] = fset
132             else:
133                 totalset[label] = 1
134                 overlapcheck[label] = {label: 1}
135         inverse = {}
136         for label, itsfirst in overlapcheck.items():
137             for symbol in itsfirst:
138                 if symbol in inverse:
139                     raise ValueError("rule %s is ambiguous; %s is in the"
140                                      " first sets of %s as well as %s" %
141                                      (name, symbol, label, inverse[symbol]))
142                 inverse[symbol] = label
143         self.first[name] = totalset
144
145     def parse(self):
146         dfas = {}
147         startsymbol = None
148         # MSTART: (NEWLINE | RULE)* ENDMARKER
149         while self.type != token.ENDMARKER:
150             while self.type == token.NEWLINE:
151                 self.gettoken()
152             # RULE: NAME ':' RHS NEWLINE
153             name = self.expect(token.NAME)
154             self.expect(token.OP, ":")
155             a, z = self.parse_rhs()
156             self.expect(token.NEWLINE)
157             #self.dump_nfa(name, a, z)
158             dfa = self.make_dfa(a, z)
159             #self.dump_dfa(name, dfa)
160             oldlen = len(dfa)
161             self.simplify_dfa(dfa)
162             newlen = len(dfa)
163             dfas[name] = dfa
164             #print name, oldlen, newlen
165             if startsymbol is None:
166                 startsymbol = name
167         return dfas, startsymbol
168
169     def make_dfa(self, start, finish):
170         # To turn an NFA into a DFA, we define the states of the DFA
171         # to correspond to *sets* of states of the NFA.  Then do some
172         # state reduction.  Let's represent sets as dicts with 1 for
173         # values.
174         assert isinstance(start, NFAState)
175         assert isinstance(finish, NFAState)
176         def closure(state):
177             base = {}
178             addclosure(state, base)
179             return base
180         def addclosure(state, base):
181             assert isinstance(state, NFAState)
182             if state in base:
183                 return
184             base[state] = 1
185             for label, next in state.arcs:
186                 if label is None:
187                     addclosure(next, base)
188         states = [DFAState(closure(start), finish)]
189         for state in states: # NB states grows while we're iterating
190             arcs = {}
191             for nfastate in state.nfaset:
192                 for label, next in nfastate.arcs:
193                     if label is not None:
194                         addclosure(next, arcs.setdefault(label, {}))
195             for label, nfaset in sorted(arcs.items()):
196                 for st in states:
197                     if st.nfaset == nfaset:
198                         break
199                 else:
200                     st = DFAState(nfaset, finish)
201                     states.append(st)
202                 state.addarc(st, label)
203         return states # List of DFAState instances; first one is start
204
205     def dump_nfa(self, name, start, finish):
206         print("Dump of NFA for", name)
207         todo = [start]
208         for i, state in enumerate(todo):
209             print("  State", i, state is finish and "(final)" or "")
210             for label, next in state.arcs:
211                 if next in todo:
212                     j = todo.index(next)
213                 else:
214                     j = len(todo)
215                     todo.append(next)
216                 if label is None:
217                     print("    -> %d" % j)
218                 else:
219                     print("    %s -> %d" % (label, j))
220
221     def dump_dfa(self, name, dfa):
222         print("Dump of DFA for", name)
223         for i, state in enumerate(dfa):
224             print("  State", i, state.isfinal and "(final)" or "")
225             for label, next in sorted(state.arcs.items()):
226                 print("    %s -> %d" % (label, dfa.index(next)))
227
228     def simplify_dfa(self, dfa):
229         # This is not theoretically optimal, but works well enough.
230         # Algorithm: repeatedly look for two states that have the same
231         # set of arcs (same labels pointing to the same nodes) and
232         # unify them, until things stop changing.
233
234         # dfa is a list of DFAState instances
235         changes = True
236         while changes:
237             changes = False
238             for i, state_i in enumerate(dfa):
239                 for j in range(i+1, len(dfa)):
240                     state_j = dfa[j]
241                     if state_i == state_j:
242                         #print "  unify", i, j
243                         del dfa[j]
244                         for state in dfa:
245                             state.unifystate(state_j, state_i)
246                         changes = True
247                         break
248
249     def parse_rhs(self):
250         # RHS: ALT ('|' ALT)*
251         a, z = self.parse_alt()
252         if self.value != "|":
253             return a, z
254         else:
255             aa = NFAState()
256             zz = NFAState()
257             aa.addarc(a)
258             z.addarc(zz)
259             while self.value == "|":
260                 self.gettoken()
261                 a, z = self.parse_alt()
262                 aa.addarc(a)
263                 z.addarc(zz)
264             return aa, zz
265
266     def parse_alt(self):
267         # ALT: ITEM+
268         a, b = self.parse_item()
269         while (self.value in ("(", "[") or
270                self.type in (token.NAME, token.STRING)):
271             c, d = self.parse_item()
272             b.addarc(c)
273             b = d
274         return a, b
275
276     def parse_item(self):
277         # ITEM: '[' RHS ']' | ATOM ['+' | '*']
278         if self.value == "[":
279             self.gettoken()
280             a, z = self.parse_rhs()
281             self.expect(token.OP, "]")
282             a.addarc(z)
283             return a, z
284         else:
285             a, z = self.parse_atom()
286             value = self.value
287             if value not in ("+", "*"):
288                 return a, z
289             self.gettoken()
290             z.addarc(a)
291             if value == "+":
292                 return a, z
293             else:
294                 return a, a
295
296     def parse_atom(self):
297         # ATOM: '(' RHS ')' | NAME | STRING
298         if self.value == "(":
299             self.gettoken()
300             a, z = self.parse_rhs()
301             self.expect(token.OP, ")")
302             return a, z
303         elif self.type in (token.NAME, token.STRING):
304             a = NFAState()
305             z = NFAState()
306             a.addarc(z, self.value)
307             self.gettoken()
308             return a, z
309         else:
310             self.raise_error("expected (...) or NAME or STRING, got %s/%s",
311                              self.type, self.value)
312
313     def expect(self, type, value=None):
314         if self.type != type or (value is not None and self.value != value):
315             self.raise_error("expected %s/%s, got %s/%s",
316                              type, value, self.type, self.value)
317         value = self.value
318         self.gettoken()
319         return value
320
321     def gettoken(self):
322         tup = next(self.generator)
323         while tup[0] in (tokenize.COMMENT, tokenize.NL):
324             tup = next(self.generator)
325         self.type, self.value, self.begin, self.end, self.line = tup
326         #print token.tok_name[self.type], repr(self.value)
327
328     def raise_error(self, msg, *args):
329         if args:
330             try:
331                 msg = msg % args
332             except:
333                 msg = " ".join([msg] + list(map(str, args)))
334         raise SyntaxError(msg, (self.filename, self.end[0],
335                                 self.end[1], self.line))
336
337 class NFAState(object):
338
339     def __init__(self):
340         self.arcs = [] # list of (label, NFAState) pairs
341
342     def addarc(self, next, label=None):
343         assert label is None or isinstance(label, str)
344         assert isinstance(next, NFAState)
345         self.arcs.append((label, next))
346
347 class DFAState(object):
348
349     def __init__(self, nfaset, final):
350         assert isinstance(nfaset, dict)
351         assert isinstance(next(iter(nfaset)), NFAState)
352         assert isinstance(final, NFAState)
353         self.nfaset = nfaset
354         self.isfinal = final in nfaset
355         self.arcs = {} # map from label to DFAState
356
357     def addarc(self, next, label):
358         assert isinstance(label, str)
359         assert label not in self.arcs
360         assert isinstance(next, DFAState)
361         self.arcs[label] = next
362
363     def unifystate(self, old, new):
364         for label, next in self.arcs.items():
365             if next is old:
366                 self.arcs[label] = new
367
368     def __eq__(self, other):
369         # Equality test -- ignore the nfaset instance variable
370         assert isinstance(other, DFAState)
371         if self.isfinal != other.isfinal:
372             return False
373         # Can't just return self.arcs == other.arcs, because that
374         # would invoke this method recursively, with cycles...
375         if len(self.arcs) != len(other.arcs):
376             return False
377         for label, next in self.arcs.items():
378             if next is not other.arcs.get(label):
379                 return False
380         return True
381
382     __hash__ = None # For Py3 compatibility.
383
384 def generate_grammar(filename="Grammar.txt"):
385     p = ParserGenerator(filename)
386     return p.make_grammar()