| """Python parser generator |
| |
| |
| This parser generator transforms a Python grammar file into parsing tables |
| that can be consumed by Python's LL(1) parser written in C. |
| |
| Concepts |
| -------- |
| |
| * An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a |
| top-down parser for a subset of context-free languages. It parses the input |
| from Left to right, performing Leftmost derivation of the sentence, and can |
| only use 1 token of lookahead when parsing a sentence. |
| |
| * A parsing table is a collection of data that a generic implementation of the |
| LL(1) parser consumes to know how to parse a given context-free grammar. In |
  this case the collection of data involves Deterministic Finite Automata,
| calculated first sets, keywords and transition labels. |
| |
| * A grammar is defined by production rules (or just 'productions') that specify |
| which symbols may replace which other symbols; these rules may be used to |
| generate strings, or to parse them. Each such rule has a head, or left-hand |
| side, which consists of the string that may be replaced, and a body, or |
| right-hand side, which consists of a string that may replace it. In the |
| Python grammar, rules are written in the form |
| |
      rule_name: rule_description
| |
  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
  grammar is a grammar in which the left-hand side of each production rule
  consists of only a single nonterminal symbol. Context-free grammars can
  always be recognized by a non-deterministic pushdown automaton.
| |
| * Terminal symbols are literal symbols which may appear in the outputs of the |
| production rules of the grammar and which cannot be changed using the rules |
| of the grammar. Applying the rules recursively to a source string of symbols |
| will usually terminate in a final output string consisting only of terminal |
| symbols. |
| |
| * Nonterminal symbols are those symbols which can be replaced. The grammar |
  includes a start symbol, a designated member of the set of nonterminals from
| which all the strings in the language may be derived by successive |
| applications of the production rules. |
| |
* The language defined by the grammar is the set of terminal strings that can
  be derived using the production rules.
| |
| * The first sets of a rule (FIRST(rule)) are defined to be the set of terminals |
| that can appear in the first position of any string derived from the rule. |
| This is useful for LL(1) parsers as the parser is only allowed to look at the |
  next token in the input to decide which rule it needs to parse. For example,
  given this grammar:
| |
    start: '(' (A | B) ')'
    A: 'a' '<'
    B: 'b' '<'
| |
  and the input '(b<)', after reading '(' the parser can only look at 'b' to
  know if it needs to parse A or B. Because FIRST(A) = {'a'} and
  FIRST(B) = {'b'}, it knows that it needs to continue parsing rule B because
  only that rule can start with 'b' (see the sketch below).
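
  A minimal sketch of this FIRST computation (hypothetical helper names, not
  part of this module's API; it assumes no rule derives the empty string):

      rules = {"A": [["a", "<"]], "B": [["b", "<"]]}

      def first(symbol):
          # A terminal's FIRST set is itself; a nonterminal's FIRST set is
          # the union of FIRST of the leading symbol of each alternative.
          if symbol not in rules:
              return {symbol}
          return {tok for alt in rules[symbol] for tok in first(alt[0])}

      assert first("A") == {"a"} and first("B") == {"b"}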
| |
| Description |
| ----------- |
| |
| The input for the parser generator is a grammar in extended BNF form (using * |
| for repetition, + for at-least-once repetition, [] for optional parts, | for |
| alternatives and () for grouping). |
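
For illustration, a rule combining several of these operators could look like
this (a made-up rule in the same notation, not quoted from the actual Grammar
file):

    argument_list: argument (',' argument)* [','] | '*' expression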
| |
| Each rule in the grammar file is considered as a regular expression in its |
| own right. It is turned into a Non-deterministic Finite Automaton (NFA), |
| which is then turned into a Deterministic Finite Automaton (DFA), which is |
| then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, |
| or similar compiler books (this technique is more often used for lexical |
| analyzers). |
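
The NFA-to-DFA step is the classic subset construction. A generic,
self-contained sketch of it (not this module's DFA.from_nfa, which works on
its own NFA representation) could look like this:

    def epsilon_closure(nfa, states):
        # `nfa` maps a state to a list of (label, target) arcs; a label of
        # None stands for an epsilon (empty) transition.
        stack, closure = list(states), set(states)
        while stack:
            state = stack.pop()
            for label, target in nfa.get(state, []):
                if label is None and target not in closure:
                    closure.add(target)
                    stack.append(target)
        return frozenset(closure)

    def nfa_to_dfa(nfa, start):
        # Each DFA state is the frozenset of NFA states reachable on the same
        # input; newly discovered sets are expanded until none remain.
        start_set = epsilon_closure(nfa, {start})
        dfa, todo = {}, [start_set]
        while todo:
            current = todo.pop()
            if current in dfa:
                continue
            moves = {}
            for state in current:
                for label, target in nfa.get(state, []):
                    if label is not None:
                        moves.setdefault(label, set()).add(target)
            dfa[current] = {label: epsilon_closure(nfa, targets)
                            for label, targets in moves.items()}
            todo.extend(dfa[current].values())
        return dfa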
| |
The DFAs are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.
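
For orientation, the resulting tables (see make_grammar below) are plain
Python containers; roughly:

    grammar.symbol2number   # nonterminal name -> symbol number (>= 256)
    grammar.number2symbol   # the reverse mapping
    grammar.dfas            # symbol number -> (DFA states, FIRST set)
    grammar.labels          # transition labels shared by all DFAs
    grammar.keywords        # keyword string -> label index
    grammar.tokens          # token number -> label index
    grammar.start           # symbol number of the start symbol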

Reference
| --------- |
| |
| [Aho&Ullman 77] |
| Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 |
| (first edition) |
| """ |
| |
| from ast import literal_eval |
| import collections |
| |
| from . import grammar, token |
| from .automata import DFA |
| from .metaparser import GrammarParser |
| |
| import enum |
| |
| |
| class LabelType(enum.Enum): |
| NONTERMINAL = 0 |
| NAMED_TOKEN = 1 |
| KEYWORD = 2 |
| OPERATOR = 3 |
| NONE = 4 |
| |
| |
| class Label(str): |
| def __init__(self, value): |
| self.type = self._get_type() |
| |
| def _get_type(self): |
| if self[0].isalpha(): |
| if self.upper() == self: |
| # NAMED tokens (ASYNC, NAME...) are all uppercase by convention |
| return LabelType.NAMED_TOKEN |
| else: |
                # If it is not uppercase, it must be a nonterminal.
| return LabelType.NONTERMINAL |
| else: |
| # Keywords and operators are wrapped in quotes |
| assert self[0] == self[-1] in ('"', "'"), self |
| value = literal_eval(self) |
| if value[0].isalpha(): |
| return LabelType.KEYWORD |
| else: |
| return LabelType.OPERATOR |
| |
| def __repr__(self): |
| return "{}({})".format(self.type, super().__repr__()) |
| |
| |
| class ParserGenerator(object): |
| def __init__(self, grammar_file, token_file, verbose=False, graph_file=None): |
| with open(grammar_file) as f: |
| self.grammar = f.read() |
| with open(token_file) as tok_file: |
| token_lines = tok_file.readlines() |
| self.tokens = dict(token.generate_tokens(token_lines)) |
| self.opmap = dict(token.generate_opmap(token_lines)) |
| # Manually add <> so it does not collide with != |
| self.opmap["<>"] = "NOTEQUAL" |
| self.verbose = verbose |
| self.filename = grammar_file |
| self.graph_file = graph_file |
| self.dfas, self.startsymbol = self.create_dfas() |
| self.first = {} # map from symbol name to set of tokens |
| self.calculate_first_sets() |
| |
| def create_dfas(self): |
| rule_to_dfas = collections.OrderedDict() |
| start_nonterminal = None |
| for nfa in GrammarParser(self.grammar).parse(): |
| if self.verbose: |
| print("Dump of NFA for", nfa.name) |
| nfa.dump() |
| if self.graph_file is not None: |
| nfa.dump_graph(self.graph_file.write) |
| dfa = DFA.from_nfa(nfa) |
| if self.verbose: |
| print("Dump of DFA for", dfa.name) |
| dfa.dump() |
| dfa.simplify() |
| if self.graph_file is not None: |
| dfa.dump_graph(self.graph_file.write) |
| rule_to_dfas[dfa.name] = dfa |
| |
| if start_nonterminal is None: |
| start_nonterminal = dfa.name |
| |
| return rule_to_dfas, start_nonterminal |
| |
| def make_grammar(self): |
| c = grammar.Grammar() |
| c.all_labels = set() |
| names = list(self.dfas.keys()) |
| names.remove(self.startsymbol) |
| names.insert(0, self.startsymbol) |
| for name in names: |
            # Nonterminal symbols are numbered from 256 upwards so that they
            # can never collide with terminal token numbers (all below 256).
            i = 256 + len(c.symbol2number)
| c.symbol2number[Label(name)] = i |
| c.number2symbol[i] = Label(name) |
| c.all_labels.add(name) |
| for name in names: |
| self.make_label(c, name) |
| dfa = self.dfas[name] |
| states = [] |
| for state in dfa: |
| arcs = [] |
| for label, next in sorted(state.arcs.items()): |
| c.all_labels.add(label) |
| arcs.append((self.make_label(c, label), dfa.states.index(next))) |
| if state.is_final: |
| arcs.append((0, dfa.states.index(state))) |
| states.append(arcs) |
| c.states.append(states) |
| c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name)) |
| c.start = c.symbol2number[self.startsymbol] |
| |
| if self.verbose: |
| print("") |
| print("Grammar summary") |
| print("===============") |
| |
| print("- {n_labels} labels".format(n_labels=len(c.labels))) |
| print("- {n_dfas} dfas".format(n_dfas=len(c.dfas))) |
| print("- {n_tokens} tokens".format(n_tokens=len(c.tokens))) |
| print("- {n_keywords} keywords".format(n_keywords=len(c.keywords))) |
| print( |
| "- Start symbol: {start_symbol}".format( |
| start_symbol=c.number2symbol[c.start] |
| ) |
| ) |
| return c |
| |
| def make_first_sets(self, c, name): |
| rawfirst = self.first[name] |
| first = set() |
| for label in sorted(rawfirst): |
| ilabel = self.make_label(c, label) |
| ##assert ilabel not in first # XXX failed on <> ... != |
| first.add(ilabel) |
| return first |
| |
| def make_label(self, c, label): |
| label = Label(label) |
| ilabel = len(c.labels) |
| |
| if label.type == LabelType.NONTERMINAL: |
| if label in c.symbol2label: |
| return c.symbol2label[label] |
| else: |
| c.labels.append((c.symbol2number[label], None)) |
| c.symbol2label[label] = ilabel |
| return ilabel |
| elif label.type == LabelType.NAMED_TOKEN: |
| # A named token (NAME, NUMBER, STRING) |
| itoken = self.tokens.get(label, None) |
| assert isinstance(itoken, int), label |
| assert itoken in self.tokens.values(), label |
| if itoken in c.tokens: |
| return c.tokens[itoken] |
| else: |
| c.labels.append((itoken, None)) |
| c.tokens[itoken] = ilabel |
| return ilabel |
| elif label.type == LabelType.KEYWORD: |
| # A keyword |
| value = literal_eval(label) |
| if value in c.keywords: |
| return c.keywords[value] |
| else: |
| c.labels.append((self.tokens["NAME"], value)) |
| c.keywords[value] = ilabel |
| return ilabel |
| elif label.type == LabelType.OPERATOR: |
| # An operator (any non-numeric token) |
| value = literal_eval(label) |
| tok_name = self.opmap[value] # Fails if unknown token |
| itoken = self.tokens[tok_name] |
| if itoken in c.tokens: |
| return c.tokens[itoken] |
| else: |
| c.labels.append((itoken, None)) |
| c.tokens[itoken] = ilabel |
| return ilabel |
| else: |
| raise ValueError("Cannot categorize label {}".format(label)) |
| |
| def calculate_first_sets(self): |
| names = list(self.dfas.keys()) |
| for name in names: |
| if name not in self.first: |
| self.calculate_first_sets_for_rule(name) |
| |
| if self.verbose: |
| print("First set for {dfa_name}".format(dfa_name=name)) |
| for item in self.first[name]: |
| print(" - {terminal}".format(terminal=item)) |
| |
| def calculate_first_sets_for_rule(self, name): |
| dfa = self.dfas[name] |
| self.first[name] = None # dummy to detect left recursion |
| state = dfa.states[0] |
| totalset = set() |
| overlapcheck = {} |
| for label, next in state.arcs.items(): |
| if label in self.dfas: |
| if label in self.first: |
| fset = self.first[label] |
| if fset is None: |
| raise ValueError("recursion for rule %r" % name) |
| else: |
| self.calculate_first_sets_for_rule(label) |
| fset = self.first[label] |
| totalset.update(fset) |
| overlapcheck[label] = fset |
| else: |
| totalset.add(label) |
| overlapcheck[label] = {label} |
| inverse = {} |
| for label, itsfirst in overlapcheck.items(): |
| for symbol in itsfirst: |
| if symbol in inverse: |
| raise ValueError( |
| "rule %s is ambiguous; %s is in the" |
| " first sets of %s as well as %s" |
| % (name, symbol, label, inverse[symbol]) |
| ) |
| inverse[symbol] = label |
| self.first[name] = totalset |
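

# A rough usage sketch (the file paths are assumptions; nothing in this module
# runs this code):
#
#     generator = ParserGenerator("Grammar/Grammar", "Grammar/Tokens",
#                                 verbose=True)
#     grammar = generator.make_grammar()
#     # `grammar` now bundles the DFAs, labels, keywords and first sets
#     # described in the module docstring.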