All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
4 """Convert graminit.[ch] spit out by pgen to Python code.
6 Pgen is the Python parser generator. It is useful to quickly create a
7 parser from a grammar file in Python's grammar notation. But I don't
8 want my parsers to be written in C (yet), so I'm translating the
9 parsing tables to Python data structures and writing a Python parse engine.
12 Note that the token numbers are constants determined by the standard
13 Python tokenizer. The standard token module defines these numbers and
14 their names (the names are not used much). The token numbers are
15 hardcoded into the Python tokenizer and into pgen. A Python
16 implementation of the Python tokenizer is also available, in the
17 standard tokenize module.
19 On the other hand, symbol numbers (representing the grammar's
20 non-terminals) are assigned by pgen based on the actual grammar input.
23 Note: this module is pretty much obsolete; the pgen module generates
24 equivalent grammar tables directly from the Grammar.txt input file
25 without having to invoke the Python pgen C program.
33 from pgen2 import grammar, token
36 class Converter(grammar.Grammar):
37 """Grammar subclass that reads classic pgen output files.
39 The run() method reads the tables as produced by the pgen parser
40 generator, typically contained in two C files, graminit.h and
41 graminit.c. The other methods are for internal use only.
43 See the base class for more documentation.
47 def run(self, graminit_h, graminit_c):
48 """Load the grammar tables from the text files written by pgen."""
49 self.parse_graminit_h(graminit_h)
50 self.parse_graminit_c(graminit_c)
53 def parse_graminit_h(self, filename):
54 """Parse the .h file written by pgen. (Internal)
56 This file is a sequence of #define statements defining the
57 nonterminals of the grammar as numbers. We build two tables
58 mapping the numbers to names and back.
63 except OSError as err:
64 print("Can't open %s: %s" % (filename, err))
66 self.symbol2number = {}
67 self.number2symbol = {}
71 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
72 if not mo and line.strip():
73 print("%s(%s): can't parse %s" % (filename, lineno,
76 symbol, number = mo.groups()
78 assert symbol not in self.symbol2number
79 assert number not in self.number2symbol
80 self.symbol2number[symbol] = number
81 self.number2symbol[number] = symbol
84 def parse_graminit_c(self, filename):
85 """Parse the .c file written by pgen. (Internal)
87 The file looks as follows. The first two lines are always this:
89 #include "pgenheaders.h"
92 After that come four blocks:
94 1) one or more state definitions
95 2) a table defining dfas
96 3) a table defining labels
97 4) a struct defining the grammar
99 A state definition has the following form:
100 - one or more arc arrays, each of the form:
101 static arc arcs_<n>_<m>[<k>] = {
105 - followed by a state array, of the form:
106 static state states_<s>[<t>] = {
114 except OSError as err:
115 print("Can't open %s: %s" % (filename, err))
117 # The code below essentially uses f's iterator-ness!
120 # Expect the two #include lines
121 lineno, line = lineno+1, next(f)
122 assert line == '#include "pgenheaders.h"\n', (lineno, line)
123 lineno, line = lineno+1, next(f)
124 assert line == '#include "grammar.h"\n', (lineno, line)
126 # Parse the state definitions
127 lineno, line = lineno+1, next(f)
130 while line.startswith("static arc "):
131 while line.startswith("static arc "):
132 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
134 assert mo, (lineno, line)
135 n, m, k = list(map(int, mo.groups()))
138 lineno, line = lineno+1, next(f)
139 mo = re.match(r"\s+{(\d+), (\d+)},$", line)
140 assert mo, (lineno, line)
141 i, j = list(map(int, mo.groups()))
143 lineno, line = lineno+1, next(f)
144 assert line == "};\n", (lineno, line)
145 allarcs[(n, m)] = arcs
146 lineno, line = lineno+1, next(f)
147 mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
148 assert mo, (lineno, line)
149 s, t = list(map(int, mo.groups()))
150 assert s == len(states), (lineno, line)
153 lineno, line = lineno+1, next(f)
154 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
155 assert mo, (lineno, line)
156 k, n, m = list(map(int, mo.groups()))
158 assert k == len(arcs), (lineno, line)
161 lineno, line = lineno+1, next(f)
162 assert line == "};\n", (lineno, line)
163 lineno, line = lineno+1, next(f)
168 mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
169 assert mo, (lineno, line)
170 ndfas = int(mo.group(1))
171 for i in range(ndfas):
172 lineno, line = lineno+1, next(f)
173 mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
175 assert mo, (lineno, line)
177 number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
178 assert self.symbol2number[symbol] == number, (lineno, line)
179 assert self.number2symbol[number] == symbol, (lineno, line)
180 assert x == 0, (lineno, line)
182 assert y == len(state), (lineno, line)
183 lineno, line = lineno+1, next(f)
184 mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
185 assert mo, (lineno, line)
187 rawbitset = eval(mo.group(1))
188 for i, c in enumerate(rawbitset):
193 dfas[number] = (state, first)
194 lineno, line = lineno+1, next(f)
195 assert line == "};\n", (lineno, line)
200 lineno, line = lineno+1, next(f)
201 mo = re.match(r"static label labels\[(\d+)\] = {$", line)
202 assert mo, (lineno, line)
203 nlabels = int(mo.group(1))
204 for i in range(nlabels):
205 lineno, line = lineno+1, next(f)
206 mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
207 assert mo, (lineno, line)
214 labels.append((x, y))
215 lineno, line = lineno+1, next(f)
216 assert line == "};\n", (lineno, line)
219 # Parse the grammar struct
220 lineno, line = lineno+1, next(f)
221 assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
222 lineno, line = lineno+1, next(f)
223 mo = re.match(r"\s+(\d+),$", line)
224 assert mo, (lineno, line)
225 ndfas = int(mo.group(1))
226 assert ndfas == len(self.dfas)
227 lineno, line = lineno+1, next(f)
228 assert line == "\tdfas,\n", (lineno, line)
229 lineno, line = lineno+1, next(f)
230 mo = re.match(r"\s+{(\d+), labels},$", line)
231 assert mo, (lineno, line)
232 nlabels = int(mo.group(1))
233 assert nlabels == len(self.labels), (lineno, line)
234 lineno, line = lineno+1, next(f)
235 mo = re.match(r"\s+(\d+)$", line)
236 assert mo, (lineno, line)
237 start = int(mo.group(1))
238 assert start in self.number2symbol, (lineno, line)
240 lineno, line = lineno+1, next(f)
241 assert line == "};\n", (lineno, line)
243 lineno, line = lineno+1, next(f)
244 except StopIteration:
247 assert 0, (lineno, line)
249 def finish_off(self):
250 """Create additional useful structures. (Internal)."""
251 self.keywords = {} # map from keyword strings to arc labels
252 self.tokens = {} # map from numeric token values to arc labels
253 for ilabel, (type, value) in enumerate(self.labels):
254 if type == token.NAME and value is not None:
255 self.keywords[value] = ilabel
257 self.tokens[type] = ilabel