"""Python parser generator


This parser generator transforms a Python grammar file into parsing tables
that can be consumed by Python's LL(1) parser written in C.

Concepts
--------

* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
  top-down parser for a subset of context-free languages. It parses the input
  from Left to right, performing Leftmost derivation of the sentence, and can
  only use 1 token of lookahead when parsing a sentence.

* A parsing table is a collection of data that a generic implementation of the
  LL(1) parser consumes to know how to parse a given context-free grammar. In
  this case the collection of data involves Deterministic Finite Automatons,
  calculated first sets, keywords and transition labels.

* A grammar is defined by production rules (or just 'productions') that specify
  which symbols may replace which other symbols; these rules may be used to
  generate strings, or to parse them. Each such rule has a head, or left-hand
  side, which consists of the string that may be replaced, and a body, or
  right-hand side, which consists of a string that may replace it. In the
  Python grammar, rules are written in the form

      rule_name: rule_description;

  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
  grammar is a grammar in which the left-hand side of each production rule
  consists of only a single nonterminal symbol. Context-free grammars can
  always be recognized by a non-deterministic pushdown automaton.

* Terminal symbols are literal symbols which may appear in the outputs of the
  production rules of the grammar and which cannot be changed using the rules
  of the grammar. Applying the rules recursively to a source string of symbols
  will usually terminate in a final output string consisting only of terminal
  symbols.

* Nonterminal symbols are those symbols which can be replaced. The grammar
  includes a start symbol, a designated member of the set of nonterminals from
  which all the strings in the language may be derived by successive
  applications of the production rules.

* The language defined by the grammar is the set of terminal strings that can
  be derived using the production rules.

* The first set of a rule (FIRST(rule)) is defined to be the set of terminals
  that can appear in the first position of any string derived from the rule.
  This is useful for LL(1) parsers as the parser is only allowed to look at the
  next token in the input to know which rule it needs to parse. For example,
  given this grammar:

      start: '(' (A | B) ')'
      A: 'a' '<'
      B: 'b' '<'

  and the input '(b<)' the parser can only look at 'b' to know if it needs
  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
  that it needs to continue parsing rule B because only that rule can start
  with 'b'.
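
  Note that when an alternative of a rule starts with a nonterminal, the first
  set of that nonterminal is folded in. With a (made-up) variant of the grammar
  above such as

      start: A | B
      A: 'a' '<'
      B: 'b' '<'

  FIRST(start) would be FIRST(A) | FIRST(B) = {'a', 'b'}: the parser can still
  choose between the two alternatives with a single token of lookahead.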

Description
-----------

The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).
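
For example, a made-up rule combining several of these operators could look
like

    with_stmt: 'with' expr ['as' NAME] (',' expr ['as' NAME])* ':' suite

where 'with', 'as', ',' and ':' are quoted keywords and operators, NAME is a
named token, and names like expr and suite stand for other rules defined
elsewhere in the grammar file.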

Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
which is then turned into a Deterministic Finite Automaton (DFA), which is
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
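
As an illustration, a made-up rule such as

    x: 'a' 'b' | 'a' 'c'

produces an NFA with two alternative paths that both start with 'a'; the
subset construction merges them into a single DFA state with one 'a'
transition, after which 'b' and 'c' lead to different states.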

The DFA's are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.

Reference
---------

[Aho&Ullman 77]
    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
    (first edition)
"""

from ast import literal_eval
import collections

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser

import enum


class LabelType(enum.Enum):
    NONTERMINAL = 0
    NAMED_TOKEN = 1
    KEYWORD = 2
    OPERATOR = 3
    NONE = 4


class Label(str):
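    # A Label is the raw string of a grammar label together with its category.
    # Illustrative examples of how _get_type() below categorizes labels:
    #
    #     Label("expr")   ->  LabelType.NONTERMINAL  (lowercase identifier)
    #     Label("NAME")   ->  LabelType.NAMED_TOKEN  (all-uppercase identifier)
    #     Label("'if'")   ->  LabelType.KEYWORD      (quoted, starts with a letter)
    #     Label("'+'")    ->  LabelType.OPERATOR     (quoted, starts with a symbol)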
    def __init__(self, value):
        self.type = self._get_type()

    def _get_type(self):
        if self[0].isalpha():
            if self.upper() == self:
                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
                return LabelType.NAMED_TOKEN
            else:
                # If it is not uppercase, it must be a nonterminal.
                return LabelType.NONTERMINAL
        else:
            # Keywords and operators are wrapped in quotes
            assert self[0] == self[-1] in ('"', "'"), self
            value = literal_eval(self)
            if value[0].isalpha():
                return LabelType.KEYWORD
            else:
                return LabelType.OPERATOR

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())


class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False):
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # map from symbol name to set of tokens
        self.calculate_first_sets()

    def create_dfas(self):
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            rule_to_dfas[dfa.name] = dfa

            if start_nonterminal is None:
                start_nonterminal = dfa.name

        return rule_to_dfas, start_nonterminal

    def make_grammar(self):
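        # Build the grammar.Grammar object holding the parsing tables that are
        # later rendered as C code.  Roughly: nonterminals get numbers starting
        # at 256 (token numbers stay below that), each DFA becomes a list of
        # states, and each state becomes a list of (label index, next state
        # index) arcs; an arc with label 0 that points back at the state itself
        # marks the state as accepting.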
        c = grammar.Grammar()
        c.all_labels = set()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first_sets(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
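        # Return the index of this label in c.labels, adding a new entry the
        # first time a label is seen.  Each entry is a (type, value) pair:
        # nonterminals are stored as (symbol_number, None), named tokens and
        # operators as (token_number, None), and keywords as the NAME token
        # number paired with the keyword string itself.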
        label = Label(label)
        ilabel = len(c.labels)

        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))

    def calculate_first_sets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print(" - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa.states[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calculate_first_sets_for_rule(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
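

# A minimal usage sketch (the file paths are illustrative; the command-line
# entry point that normally drives this class lives in this package's
# __main__ module):
#
#     p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=False)
#     c = p.make_grammar()  # grammar.Grammar object holding the parsing tables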