All patches and comments are welcome. Please squash your changes to logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
6 """Convert graminit.[ch] spit out by pgen to Python code.
8 Pgen is the Python parser generator. It is useful to quickly create a
9 parser from a grammar file in Python's grammar notation. But I don't
10 want my parsers to be written in C (yet), so I'm translating the
11 parsing tables to Python data structures and writing a Python parse
14 Note that the token numbers are constants determined by the standard
15 Python tokenizer. The standard token module defines these numbers and
16 their names (the names are not used much). The token numbers are
17 hardcoded into the Python tokenizer and into pgen. A Python
18 implementation of the Python tokenizer is also available, in the
19 standard tokenize module.
21 On the other hand, symbol numbers (representing the grammar's
22 non-terminals) are assigned by pgen based on the actual grammar
25 Note: this module is pretty much obsolete; the pgen module generates
26 equivalent grammar tables directly from the Grammar.txt input file
27 without having to invoke the Python pgen C program.
35 from pgen2 import grammar, token
38 class Converter(grammar.Grammar):
39 """Grammar subclass that reads classic pgen output files.
41 The run() method reads the tables as produced by the pgen parser
42 generator, typically contained in two C files, graminit.h and
43 graminit.c. The other methods are for internal use only.
45 See the base class for more documentation.
49 def run(self, graminit_h, graminit_c):
50 """Load the grammar tables from the text files written by pgen."""
51 self.parse_graminit_h(graminit_h)
52 self.parse_graminit_c(graminit_c)
55 def parse_graminit_h(self, filename):
56 """Parse the .h file written by pgen. (Internal)
58 This file is a sequence of #define statements defining the
59 nonterminals of the grammar as numbers. We build two tables
60 mapping the numbers to names and back.
65 except OSError as err:
66 print(f"Can't open {filename}: {err}")
68 self.symbol2number = {}
69 self.number2symbol = {}
73 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
74 if not mo and line.strip():
75 print(f"{filename}({lineno}): can't parse {line.strip()}")
77 symbol, number = mo.groups()
79 assert symbol not in self.symbol2number
80 assert number not in self.number2symbol
81 self.symbol2number[symbol] = number
82 self.number2symbol[number] = symbol
85 def parse_graminit_c(self, filename):
86 """Parse the .c file written by pgen. (Internal)
88 The file looks as follows. The first two lines are always this:
90 #include "pgenheaders.h"
93 After that come four blocks:
95 1) one or more state definitions
96 2) a table defining dfas
97 3) a table defining labels
98 4) a struct defining the grammar
100 A state definition has the following form:
101 - one or more arc arrays, each of the form:
102 static arc arcs_<n>_<m>[<k>] = {
106 - followed by a state array, of the form:
107 static state states_<s>[<t>] = {
115 except OSError as err:
116 print(f"Can't open {filename}: {err}")
118 # The code below essentially uses f's iterator-ness!
121 # Expect the two #include lines
122 lineno, line = lineno + 1, next(f)
123 assert line == '#include "pgenheaders.h"\n', (lineno, line)
124 lineno, line = lineno + 1, next(f)
125 assert line == '#include "grammar.h"\n', (lineno, line)
127 # Parse the state definitions
128 lineno, line = lineno + 1, next(f)
131 while line.startswith("static arc "):
132 while line.startswith("static arc "):
133 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", line)
134 assert mo, (lineno, line)
135 n, m, k = list(map(int, mo.groups()))
138 lineno, line = lineno + 1, next(f)
139 mo = re.match(r"\s+{(\d+), (\d+)},$", line)
140 assert mo, (lineno, line)
141 i, j = list(map(int, mo.groups()))
143 lineno, line = lineno + 1, next(f)
144 assert line == "};\n", (lineno, line)
145 allarcs[(n, m)] = arcs
146 lineno, line = lineno + 1, next(f)
147 mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
148 assert mo, (lineno, line)
149 s, t = list(map(int, mo.groups()))
150 assert s == len(states), (lineno, line)
153 lineno, line = lineno + 1, next(f)
154 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
155 assert mo, (lineno, line)
156 k, n, m = list(map(int, mo.groups()))
158 assert k == len(arcs), (lineno, line)
161 lineno, line = lineno + 1, next(f)
162 assert line == "};\n", (lineno, line)
163 lineno, line = lineno + 1, next(f)
168 mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
169 assert mo, (lineno, line)
170 ndfas = int(mo.group(1))
171 for i in range(ndfas):
172 lineno, line = lineno + 1, next(f)
173 mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', line)
174 assert mo, (lineno, line)
176 number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
177 assert self.symbol2number[symbol] == number, (lineno, line)
178 assert self.number2symbol[number] == symbol, (lineno, line)
179 assert x == 0, (lineno, line)
181 assert y == len(state), (lineno, line)
182 lineno, line = lineno + 1, next(f)
183 mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
184 assert mo, (lineno, line)
186 rawbitset = eval(mo.group(1))
187 for i, c in enumerate(rawbitset):
192 dfas[number] = (state, first)
193 lineno, line = lineno + 1, next(f)
194 assert line == "};\n", (lineno, line)
199 lineno, line = lineno + 1, next(f)
200 mo = re.match(r"static label labels\[(\d+)\] = {$", line)
201 assert mo, (lineno, line)
202 nlabels = int(mo.group(1))
203 for i in range(nlabels):
204 lineno, line = lineno + 1, next(f)
205 mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
206 assert mo, (lineno, line)
213 labels.append((x, y))
214 lineno, line = lineno + 1, next(f)
215 assert line == "};\n", (lineno, line)
218 # Parse the grammar struct
219 lineno, line = lineno + 1, next(f)
220 assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
221 lineno, line = lineno + 1, next(f)
222 mo = re.match(r"\s+(\d+),$", line)
223 assert mo, (lineno, line)
224 ndfas = int(mo.group(1))
225 assert ndfas == len(self.dfas)
226 lineno, line = lineno + 1, next(f)
227 assert line == "\tdfas,\n", (lineno, line)
228 lineno, line = lineno + 1, next(f)
229 mo = re.match(r"\s+{(\d+), labels},$", line)
230 assert mo, (lineno, line)
231 nlabels = int(mo.group(1))
232 assert nlabels == len(self.labels), (lineno, line)
233 lineno, line = lineno + 1, next(f)
234 mo = re.match(r"\s+(\d+)$", line)
235 assert mo, (lineno, line)
236 start = int(mo.group(1))
237 assert start in self.number2symbol, (lineno, line)
239 lineno, line = lineno + 1, next(f)
240 assert line == "};\n", (lineno, line)
242 lineno, line = lineno + 1, next(f)
243 except StopIteration:
246 assert 0, (lineno, line)
248 def finish_off(self):
249 """Create additional useful structures. (Internal)."""
250 self.keywords = {} # map from keyword strings to arc labels
251 self.tokens = {} # map from numeric token values to arc labels
252 for ilabel, (type, value) in enumerate(self.labels):
253 if type == token.NAME and value is not None:
254 self.keywords[value] = ilabel
256 self.tokens[type] = ilabel