import collections
import tokenize  # from stdlib

from . import grammar, token


class ParserGenerator(object):
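    """Generate parser tables from a pgen-style grammar file.

    A minimal usage sketch (the file names below are illustrative
    assumptions, not fixed by this class):

        pgen = ParserGenerator("Grammar", "Tokens", verbose=True)
        g = pgen.make_grammar()  # returns a grammar.Grammar instance
    """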

    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
        close_stream = None
        if stream is None:
            stream = open(grammar_file)
            close_stream = stream.close
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
        c = grammar.Grammar()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
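        # Assign numbers to all non-terminals first.  By convention they
        # start at 256 so they can never collide with terminal token
        # numbers, which stay below 256; the start symbol was moved to
        # the front above, so it always gets number 256.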
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
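        # c.labels is a list of (type, value) pairs; value is None for
        # plain tokens and non-terminals, and the keyword string for
        # keywords, e.g. (NAME, None) for any identifier versus
        # (NAME, "if") for the keyword "if".  ilabel below is the index
        # a newly created label will receive.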
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = self.tokens.get(label, None)
                assert isinstance(itoken, int), label
                assert itoken in self.tokens.values(), label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
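            # eval() strips the quotes from the grammar literal, turning
            # the label "'if'" into the string "if" (and '"+="' into "+=").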
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((self.tokens["NAME"], value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                tok_name = self.opmap[value]  # Fails if unknown token
                itoken = self.tokens[tok_name]
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

    def addfirstsets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calcfirst(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calcfirst(self, name):
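        # Compute the FIRST set of a rule: all tokens that can begin it.
        # Only the DFA's initial state needs inspecting: each outgoing
        # arc label is either a terminal (added directly) or a
        # non-terminal, whose own FIRST set is folded in recursively.
        # E.g. for a hypothetical rule "decorated: decorator (classdef |
        # funcdef)", FIRST(decorated) is just FIRST(decorator).  The
        # overlap check below rejects grammars where one token of
        # lookahead cannot choose between alternatives.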
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset = set()
        overlapcheck = {}
        for label in state.arcs:
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self):
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(tokenize.NAME)
            if self.verbose:
                print("Processing rule {dfa_name}".format(dfa_name=name))
            self.expect(tokenize.OP, ":")
            a, z = self.parse_rhs()
            self.expect(tokenize.NEWLINE)
            if self.verbose:
                self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            if self.verbose:
                self.dump_dfa(name, dfa)
            self.simplify_dfa(dfa)
            dfas[name] = dfa
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol
    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA, then do some
        # state reduction (see simplify_dfa()).  The NFA state sets
        # are represented as Python sets.
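        # Sketch for a hypothetical RHS "'a' ['b']": the start DFA state
        # is the epsilon-closure of the NFA start; following its only
        # labeled arc 'a' yields a state whose NFA set contains both the
        # finish state (making it final) and the state carrying the 'b'
        # arc, so the DFA is: state0 --a--> state1(final) --b--> state2(final).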
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state):
            base = set()
            addclosure(state, base)
            return base

        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base.add(state)
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, set()))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.
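        # E.g. two distinct final states with no outgoing arcs compare
        # equal under DFAState.__eq__ and get merged; that merge can in
        # turn make two of their predecessors equal, so we keep scanning
        # until a full pass makes no change.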

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print("unify", i, j)
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
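        # Alternation builds a fresh start state aa with an epsilon arc
        # to each alternative's start, and routes each alternative's end
        # into a fresh common end state zz:
        #     aa --e--> a1 ... z1 --e--> zz
        #     aa --e--> a2 ... z2 --e--> zz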
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (tokenize.NAME, tokenize.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
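        # '[' RHS ']' is made optional by adding an epsilon arc straight
        # from its start to its end.  For repetition, z.addarc(a) closes
        # the loop back to the start; '+' returns (a, z), requiring at
        # least one pass, while '*' returns (a, a) so the item can also
        # match zero times.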
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))


class NFAState(object):

    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, set)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # For Py3 compatibility.
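

# A minimal smoke-test sketch, assuming this module is run inside its
# package (it uses relative imports) with pgen-style grammar and token
# files supplied on the command line; the module and file names in the
# usage line are illustrative assumptions:
#     python -m <package>.pgen Grammar Tokens
if __name__ == "__main__":
    import sys

    _pgen = ParserGenerator(sys.argv[1], sys.argv[2], verbose=True)
    _grammar = _pgen.make_grammar()
    print("Start symbol:", _grammar.number2symbol[_grammar.start])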