blob: bf49762ae48234428637f7aaad792b888a6354b6 [file] [log] [blame]
Martin v. Löwisef04c442008-03-19 05:04:44 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""Convert graminit.[ch] spit out by pgen to Python code.
5
6Pgen is the Python parser generator. It is useful to quickly create a
7parser from a grammar file in Python's grammar notation. But I don't
8want my parsers to be written in C (yet), so I'm translating the
9parsing tables to Python data structures and writing a Python parse
10engine.
11
12Note that the token numbers are constants determined by the standard
13Python tokenizer. The standard token module defines these numbers and
14their names (the names are not used much). The token numbers are
15hardcoded into the Python tokenizer and into pgen. A Python
16implementation of the Python tokenizer is also available, in the
17standard tokenize module.
18
19On the other hand, symbol numbers (representing the grammar's
20non-terminals) are assigned by pgen based on the actual grammar
21input.
22
23Note: this module is pretty much obsolete; the pgen module generates
24equivalent grammar tables directly from the Grammar.txt input file
25without having to invoke the Python pgen C program.
26
27"""
28
29# Python imports
30import re
31
32# Local imports
33from pgen2 import grammar, token
34
35
36class Converter(grammar.Grammar):
37 """Grammar subclass that reads classic pgen output files.
38
39 The run() method reads the tables as produced by the pgen parser
40 generator, typically contained in two C files, graminit.h and
41 graminit.c. The other methods are for internal use only.
42
43 See the base class for more documentation.
44
45 """
46
47 def run(self, graminit_h, graminit_c):
48 """Load the grammar tables from the text files written by pgen."""
49 self.parse_graminit_h(graminit_h)
50 self.parse_graminit_c(graminit_c)
51 self.finish_off()
52
53 def parse_graminit_h(self, filename):
Ezio Melotti13925002011-03-16 11:05:33 +020054 """Parse the .h file written by pgen. (Internal)
Martin v. Löwisef04c442008-03-19 05:04:44 +000055
56 This file is a sequence of #define statements defining the
57 nonterminals of the grammar as numbers. We build two tables
58 mapping the numbers to names and back.
59
60 """
61 try:
62 f = open(filename)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +000063 except IOError as err:
64 print("Can't open %s: %s" % (filename, err))
Martin v. Löwisef04c442008-03-19 05:04:44 +000065 return False
66 self.symbol2number = {}
67 self.number2symbol = {}
68 lineno = 0
69 for line in f:
70 lineno += 1
71 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
72 if not mo and line.strip():
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +000073 print("%s(%s): can't parse %s" % (filename, lineno,
74 line.strip()))
Martin v. Löwisef04c442008-03-19 05:04:44 +000075 else:
76 symbol, number = mo.groups()
77 number = int(number)
78 assert symbol not in self.symbol2number
79 assert number not in self.number2symbol
80 self.symbol2number[symbol] = number
81 self.number2symbol[number] = symbol
82 return True
83
84 def parse_graminit_c(self, filename):
Ezio Melotti13925002011-03-16 11:05:33 +020085 """Parse the .c file written by pgen. (Internal)
Martin v. Löwisef04c442008-03-19 05:04:44 +000086
87 The file looks as follows. The first two lines are always this:
88
89 #include "pgenheaders.h"
90 #include "grammar.h"
91
92 After that come four blocks:
93
94 1) one or more state definitions
95 2) a table defining dfas
96 3) a table defining labels
97 4) a struct defining the grammar
98
99 A state definition has the following form:
100 - one or more arc arrays, each of the form:
101 static arc arcs_<n>_<m>[<k>] = {
102 {<i>, <j>},
103 ...
104 };
105 - followed by a state array, of the form:
106 static state states_<s>[<t>] = {
107 {<k>, arcs_<n>_<m>},
108 ...
109 };
110
111 """
112 try:
113 f = open(filename)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000114 except IOError as err:
115 print("Can't open %s: %s" % (filename, err))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000116 return False
117 # The code below essentially uses f's iterator-ness!
118 lineno = 0
119
120 # Expect the two #include lines
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000121 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000122 assert line == '#include "pgenheaders.h"\n', (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000123 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000124 assert line == '#include "grammar.h"\n', (lineno, line)
125
126 # Parse the state definitions
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000127 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000128 allarcs = {}
129 states = []
130 while line.startswith("static arc "):
131 while line.startswith("static arc "):
132 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
133 line)
134 assert mo, (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000135 n, m, k = list(map(int, mo.groups()))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000136 arcs = []
137 for _ in range(k):
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000138 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000139 mo = re.match(r"\s+{(\d+), (\d+)},$", line)
140 assert mo, (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000141 i, j = list(map(int, mo.groups()))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000142 arcs.append((i, j))
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000143 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000144 assert line == "};\n", (lineno, line)
145 allarcs[(n, m)] = arcs
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000146 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000147 mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
148 assert mo, (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000149 s, t = list(map(int, mo.groups()))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000150 assert s == len(states), (lineno, line)
151 state = []
152 for _ in range(t):
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000153 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000154 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
155 assert mo, (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000156 k, n, m = list(map(int, mo.groups()))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000157 arcs = allarcs[n, m]
158 assert k == len(arcs), (lineno, line)
159 state.append(arcs)
160 states.append(state)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000161 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000162 assert line == "};\n", (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000163 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000164 self.states = states
165
166 # Parse the dfas
167 dfas = {}
168 mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
169 assert mo, (lineno, line)
170 ndfas = int(mo.group(1))
171 for i in range(ndfas):
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000172 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000173 mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
174 line)
175 assert mo, (lineno, line)
176 symbol = mo.group(2)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000177 number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
Martin v. Löwisef04c442008-03-19 05:04:44 +0000178 assert self.symbol2number[symbol] == number, (lineno, line)
179 assert self.number2symbol[number] == symbol, (lineno, line)
180 assert x == 0, (lineno, line)
181 state = states[z]
182 assert y == len(state), (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000183 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000184 mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
185 assert mo, (lineno, line)
186 first = {}
187 rawbitset = eval(mo.group(1))
188 for i, c in enumerate(rawbitset):
189 byte = ord(c)
190 for j in range(8):
191 if byte & (1<<j):
192 first[i*8 + j] = 1
193 dfas[number] = (state, first)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000194 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000195 assert line == "};\n", (lineno, line)
196 self.dfas = dfas
197
198 # Parse the labels
199 labels = []
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000200 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000201 mo = re.match(r"static label labels\[(\d+)\] = {$", line)
202 assert mo, (lineno, line)
203 nlabels = int(mo.group(1))
204 for i in range(nlabels):
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000205 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000206 mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
207 assert mo, (lineno, line)
208 x, y = mo.groups()
209 x = int(x)
210 if y == "0":
211 y = None
212 else:
213 y = eval(y)
214 labels.append((x, y))
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000215 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000216 assert line == "};\n", (lineno, line)
217 self.labels = labels
218
219 # Parse the grammar struct
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000220 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000221 assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000222 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000223 mo = re.match(r"\s+(\d+),$", line)
224 assert mo, (lineno, line)
225 ndfas = int(mo.group(1))
226 assert ndfas == len(self.dfas)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000227 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000228 assert line == "\tdfas,\n", (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000229 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000230 mo = re.match(r"\s+{(\d+), labels},$", line)
231 assert mo, (lineno, line)
232 nlabels = int(mo.group(1))
233 assert nlabels == len(self.labels), (lineno, line)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000234 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000235 mo = re.match(r"\s+(\d+)$", line)
236 assert mo, (lineno, line)
237 start = int(mo.group(1))
238 assert start in self.number2symbol, (lineno, line)
239 self.start = start
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000240 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000241 assert line == "};\n", (lineno, line)
242 try:
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000243 lineno, line = lineno+1, next(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000244 except StopIteration:
245 pass
246 else:
247 assert 0, (lineno, line)
248
249 def finish_off(self):
250 """Create additional useful structures. (Internal)."""
251 self.keywords = {} # map from keyword strings to arc labels
252 self.tokens = {} # map from numeric token values to arc labels
253 for ilabel, (type, value) in enumerate(self.labels):
254 if type == token.NAME and value is not None:
255 self.keywords[value] = ilabel
256 elif value is None:
257 self.tokens[type] = ilabel