"""Python parser generator


This parser generator transforms a Python grammar file into parsing tables
that can be consumed by Python's LL(1) parser written in C.

Concepts
--------

* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
  top-down parser for a subset of context-free languages. It parses the input
  from Left to right, performing a Leftmost derivation of the sentence, and can
  only use 1 token of lookahead when parsing a sentence.

* A parsing table is a collection of data that a generic implementation of the
  LL(1) parser consumes to know how to parse a given context-free grammar. In
  this case the collection of data comprises Deterministic Finite Automata,
  calculated first sets, keywords and transition labels.

* A grammar is defined by production rules (or just 'productions') that specify
  which symbols may replace which other symbols; these rules may be used to
  generate strings, or to parse them. Each such rule has a head, or left-hand
  side, which consists of the string that may be replaced, and a body, or
  right-hand side, which consists of a string that may replace it. In the
  Python grammar, rules are written in the form

      rule_name: rule_description;

  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
  grammar is a grammar in which the left-hand side of each production rule
  consists of only a single nonterminal symbol. Context-free languages can
  always be recognized by a non-deterministic pushdown automaton.

* Terminal symbols are literal symbols which may appear in the outputs of the
  production rules of the grammar and which cannot be changed using the rules
  of the grammar. Applying the rules recursively to a source string of symbols
  will usually terminate in a final output string consisting only of terminal
  symbols.

* Nonterminal symbols are those symbols which can be replaced. The grammar
  includes a start symbol, a designated member of the set of nonterminals from
  which all the strings in the language may be derived by successive
  applications of the production rules.

* The language defined by the grammar is the set of terminal strings that can
  be derived using the production rules.

* The first set of a rule (FIRST(rule)) is defined to be the set of terminals
  that can appear in the first position of any string derived from the rule.
  This is useful for LL(1) parsers as the parser is only allowed to look at the
  next token in the input to know which rule it needs to parse. For example,
  given this grammar:

      start: '(' A | B ')'
      A: 'a' '<'
      B: 'b' '<'

  and the input '(b<)' the parser can only look at 'b' to know if it needs
  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
  that it needs to continue parsing rule B, because only that rule can start
  with 'b' (see the sketch below).
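
  As a minimal sketch of how a table-driven LL(1) parser would use these sets
  (the rule names and the table layout here are illustrative, not this
  module's actual output):

      FIRST = {"A": {"a"}, "B": {"b"}}

      def predict(alternatives, lookahead):
          # Pick the unique alternative whose FIRST set contains the single
          # token of lookahead; an LL(1) grammar guarantees that no two
          # alternatives of a rule share a terminal in their FIRST sets.
          for alternative in alternatives:
              if lookahead in FIRST[alternative]:
                  return alternative
          raise SyntaxError("unexpected token {!r}".format(lookahead))

      predict(["A", "B"], "b")  # -> "B", as in the '(b<)' input above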

Description
-----------

The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).
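
For example, this rule from Python's grammar combines grouping, repetition and
an optional trailing part:

    testlist: test (',' test)* [',']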

Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
which is then turned into a Deterministic Finite Automaton (DFA), which is
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
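
For intuition, consider a hypothetical rule whose alternatives share a prefix:

    stmt: 'a' 'b' | 'a' 'c'

Read as a regular expression over its right-hand side, the rule yields an NFA
with two alternative paths that both begin with 'a'. The subset construction
merges those paths, so the resulting DFA (written below as a plain transition
dict, with illustrative state numbering) never has to guess which alternative
it is in:

    dfa = {
        0: {"'a'": 1},            # both alternatives consume 'a' first
        1: {"'b'": 2, "'c'": 2},  # then exactly one of 'b' or 'c'
        2: {},                    # final (accepting) state
    }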

The DFAs are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.

Reference
---------

[Aho&Ullman 77]
    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
    (first edition)
"""

from ast import literal_eval
import collections

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser

import enum


class LabelType(enum.Enum):
    NONTERMINAL = 0
    NAMED_TOKEN = 1
    KEYWORD = 2
    OPERATOR = 3
    NONE = 4


class Label(str):
    def __init__(self, value):
        self.type = self._get_type()

    def _get_type(self):
        if self[0].isalpha():
            if self.upper() == self:
                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
                return LabelType.NAMED_TOKEN
            else:
                # If it is not all uppercase, it must be a nonterminal.
                return LabelType.NONTERMINAL
        else:
            # Keywords and operators are wrapped in quotes
            assert self[0] == self[-1] in ('"', "'"), self
            value = literal_eval(self)
            if value[0].isalpha():
                return LabelType.KEYWORD
            else:
                return LabelType.OPERATOR

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())
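
    # Illustrative examples (these follow from _get_type above):
    #   Label("NAME").type      -> LabelType.NAMED_TOKEN   (all uppercase)
    #   Label("expr_stmt").type -> LabelType.NONTERMINAL   (lowercase name)
    #   Label("'if'").type      -> LabelType.KEYWORD       (quoted, alphabetic)
    #   Label("'+'").type       -> LabelType.OPERATOR      (quoted, symbolic)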


class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.graph_file = graph_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # map from symbol name to set of tokens
        self.calculate_first_sets()

    def create_dfas(self):
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            if self.graph_file is not None:
                nfa.dump_graph(self.graph_file.write)
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            if self.graph_file is not None:
                dfa.dump_graph(self.graph_file.write)
            rule_to_dfas[dfa.name] = dfa

            if start_nonterminal is None:
                start_nonterminal = dfa.name

        return rule_to_dfas, start_nonterminal
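
    # Note: the start symbol is simply the first rule in the grammar file; for
    # CPython's Grammar/Grammar, whose first rule is single_input, the returned
    # start nonterminal is "single_input".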

    def make_grammar(self):
        c = grammar.Grammar()
        c.all_labels = set()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c
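
    # Shape of the generated tables (as built above): nonterminal symbols are
    # numbered from 256 upwards; every DFA state is a list of
    # (label_index, next_state) arcs, and a final state carries an extra
    # (0, itself) arc that the consumer interprets as "this rule may end here".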

    def make_first_sets(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        label = Label(label)
        ilabel = len(c.labels)

        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))

    def calculate_first_sets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("  - {terminal}".format(terminal=item))
279
Pablo Galindo71876fa2019-08-22 02:38:39 +0100280 def calculate_first_sets_for_rule(self, name):
Pablo Galindo1f24a712019-03-01 15:34:44 -0800281 dfa = self.dfas[name]
Pablo Galindo71876fa2019-08-22 02:38:39 +0100282 self.first[name] = None # dummy to detect left recursion
283 state = dfa.states[0]
Pablo Galindo1f24a712019-03-01 15:34:44 -0800284 totalset = set()
285 overlapcheck = {}
286 for label, next in state.arcs.items():
287 if label in self.dfas:
288 if label in self.first:
289 fset = self.first[label]
290 if fset is None:
291 raise ValueError("recursion for rule %r" % name)
292 else:
Pablo Galindo71876fa2019-08-22 02:38:39 +0100293 self.calculate_first_sets_for_rule(label)
Pablo Galindo1f24a712019-03-01 15:34:44 -0800294 fset = self.first[label]
295 totalset.update(fset)
296 overlapcheck[label] = fset
297 else:
298 totalset.add(label)
299 overlapcheck[label] = {label}
300 inverse = {}
301 for label, itsfirst in overlapcheck.items():
302 for symbol in itsfirst:
303 if symbol in inverse:
Pablo Galindo71876fa2019-08-22 02:38:39 +0100304 raise ValueError(
305 "rule %s is ambiguous; %s is in the"
306 " first sets of %s as well as %s"
307 % (name, symbol, label, inverse[symbol])
308 )
Pablo Galindo1f24a712019-03-01 15:34:44 -0800309 inverse[symbol] = label
310 self.first[name] = totalset
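
        # Worked example (hypothetical grammar): with A: 'a' | B and B: 'b',
        # FIRST(B) = {"'b'"} and FIRST(A) = {"'a'", "'b'"}.  If two arcs out
        # of the initial state could start with the same terminal, the inverse
        # mapping above would see that terminal twice and raise ValueError.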