import collections
import tokenize  # from stdlib

from . import grammar, token

class ParserGenerator(object):

    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
        close_stream = None
        if stream is None:
            stream = open(grammar_file)
            close_stream = stream.close
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap['<>'] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

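    # Usage sketch (in CPython's build the inputs are Grammar/Grammar and
    # Grammar/Tokens; any pair of files in the same formats works):
    #
    #     p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
    #     g = p.make_grammar()  # a grammar.Grammar holding the parse tables
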
    def make_grammar(self):
        c = grammar.Grammar()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            # Non-terminals are numbered from 256 up; the numbers below
            # 256 are reserved for terminal tokens.
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = self.tokens.get(label, None)
                assert isinstance(itoken, int), label
                assert itoken in self.tokens.values(), label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)  # strip the quotes from the string literal
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((self.tokens["NAME"], value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                tok_name = self.opmap[value]  # Fails if unknown token
                itoken = self.tokens[tok_name]
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
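
    # The four label kinds distinguished above, sketched on hypothetical
    # grammar fragments (token names are whatever the Tokens file defines):
    #
    #     expr   -> non-terminal:  labels gets (symbol2number['expr'], None)
    #     NAME   -> named token:   labels gets (tokens['NAME'], None)
    #     'if'   -> keyword:       labels gets (tokens['NAME'], 'if')
    #     '+'    -> operator:      labels gets (tokens[opmap['+']], None)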

    def addfirstsets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calcfirst(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calcfirst(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset
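
    # A worked example on two hypothetical rules:
    #
    #     item: NAME | '(' item ')'
    #     pair: item item
    #
    # calcfirst('pair') recurses into 'item', giving
    # first['item'] == first['pair'] == {'NAME', "'('"}.  Had the two
    # alternatives of 'item' started with the same token, the overlap check
    # above would have raised ValueError instead.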

    def parse(self):
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(tokenize.NAME)
            if self.verbose:
                print("Processing rule {dfa_name}".format(dfa_name=name))
            self.expect(tokenize.OP, ":")
            a, z = self.parse_rhs()
            self.expect(tokenize.NEWLINE)
            if self.verbose:
                self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            if self.verbose:
                self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            # print(name, oldlen, newlen)
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol
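
    # The input parsed above is itself a small EBNF-flavoured meta-grammar.
    # A hypothetical two-rule grammar file would look like:
    #
    #     start: expr NEWLINE
    #     expr: NAME ('+' NAME)*
    #
    # Each rule yields an NFA (parse_rhs), which is converted to a DFA
    # (make_dfa) and then minimized in place (simplify_dfa).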

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  The sets are represented as Python sets.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state):
            base = set()
            addclosure(state, base)
            return base

        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base.add(state)
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, set()))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start
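
    # A self-contained sketch of the construction above, on a made-up NFA for
    # an optional token "a" (make_dfa never reads self, so it is invoked
    # unbound here purely for illustration):
    #
    #     start, finish = NFAState(), NFAState()
    #     start.addarc(finish)        # epsilon arc: "a" may be skipped
    #     start.addarc(finish, "a")   # labeled arc: "a" may be consumed
    #     dfa = ParserGenerator.make_dfa(None, start, finish)
    #     assert dfa[0].isfinal and dfa[1].isfinal   # both DFA states are final
    #     assert list(dfa[0].arcs) == ["a"]          # one arc, labeled "a"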

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, "(final)" if state is finish else "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, "(final)" if state.isfinal else "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print("unify", i, j)
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

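    # For example, if states i and j have identical finality and identical
    # arcs (same labels to the same target states), state j is deleted, every
    # arc that pointed at it is redirected to state i, and the scan restarts
    # until no such pair remains.
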
    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (tokenize.NAME, tokenize.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a
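
    # NFA shapes produced above for the EBNF operators, for an atom A with
    # entry state a and exit state z:
    #
    #     [A]   extra epsilon arc a -> z, return (a, z)   # A is optional
    #     A+    epsilon arc z -> a, return (a, z)         # one or more A
    #     A*    epsilon arc z -> a, return (a, a)         # zero or more A
    #           (entry doubles as exit, so A can be skipped entirely)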

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print(tokenize.tok_name[self.type], repr(self.value))

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, set)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # Explicitly unhashable: __eq__ is defined and instances are mutable.