Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
| 2 | # Licensed to PSF under a Contributor Agreement. |
| 3 | |
| 4 | # Pgen imports |
Martin v. Löwis | 3faa84f | 2008-03-22 00:07:09 +0000 | [diff] [blame] | 5 | from . import grammar, token, tokenize |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 6 | |
| 7 | class PgenGrammar(grammar.Grammar): |
| 8 | pass |
| 9 | |
| 10 | class ParserGenerator(object): |
| 11 | |
| 12 | def __init__(self, filename, stream=None): |
| 13 | close_stream = None |
| 14 | if stream is None: |
| 15 | stream = open(filename) |
| 16 | close_stream = stream.close |
| 17 | self.filename = filename |
| 18 | self.stream = stream |
| 19 | self.generator = tokenize.generate_tokens(stream.readline) |
| 20 | self.gettoken() # Initialize lookahead |
| 21 | self.dfas, self.startsymbol = self.parse() |
| 22 | if close_stream is not None: |
| 23 | close_stream() |
| 24 | self.first = {} # map from symbol name to set of tokens |
| 25 | self.addfirstsets() |
| 26 | |
| 27 | def make_grammar(self): |
| 28 | c = PgenGrammar() |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 29 | names = list(self.dfas.keys()) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 30 | names.sort() |
| 31 | names.remove(self.startsymbol) |
| 32 | names.insert(0, self.startsymbol) |
| 33 | for name in names: |
| 34 | i = 256 + len(c.symbol2number) |
| 35 | c.symbol2number[name] = i |
| 36 | c.number2symbol[i] = name |
| 37 | for name in names: |
| 38 | dfa = self.dfas[name] |
| 39 | states = [] |
| 40 | for state in dfa: |
| 41 | arcs = [] |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 42 | for label, next in state.arcs.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 43 | arcs.append((self.make_label(c, label), dfa.index(next))) |
| 44 | if state.isfinal: |
| 45 | arcs.append((0, dfa.index(state))) |
| 46 | states.append(arcs) |
| 47 | c.states.append(states) |
| 48 | c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) |
| 49 | c.start = c.symbol2number[self.startsymbol] |
| 50 | return c |
| 51 | |
| 52 | def make_first(self, c, name): |
| 53 | rawfirst = self.first[name] |
| 54 | first = {} |
| 55 | for label in rawfirst: |
| 56 | ilabel = self.make_label(c, label) |
| 57 | ##assert ilabel not in first # XXX failed on <> ... != |
| 58 | first[ilabel] = 1 |
| 59 | return first |
| 60 | |
| 61 | def make_label(self, c, label): |
| 62 | # XXX Maybe this should be a method on a subclass of converter? |
| 63 | ilabel = len(c.labels) |
| 64 | if label[0].isalpha(): |
| 65 | # Either a symbol name or a named token |
| 66 | if label in c.symbol2number: |
| 67 | # A symbol name (a non-terminal) |
| 68 | if label in c.symbol2label: |
| 69 | return c.symbol2label[label] |
| 70 | else: |
| 71 | c.labels.append((c.symbol2number[label], None)) |
| 72 | c.symbol2label[label] = ilabel |
| 73 | return ilabel |
| 74 | else: |
| 75 | # A named token (NAME, NUMBER, STRING) |
| 76 | itoken = getattr(token, label, None) |
| 77 | assert isinstance(itoken, int), label |
| 78 | assert itoken in token.tok_name, label |
| 79 | if itoken in c.tokens: |
| 80 | return c.tokens[itoken] |
| 81 | else: |
| 82 | c.labels.append((itoken, None)) |
| 83 | c.tokens[itoken] = ilabel |
| 84 | return ilabel |
| 85 | else: |
| 86 | # Either a keyword or an operator |
| 87 | assert label[0] in ('"', "'"), label |
| 88 | value = eval(label) |
| 89 | if value[0].isalpha(): |
| 90 | # A keyword |
| 91 | if value in c.keywords: |
| 92 | return c.keywords[value] |
| 93 | else: |
| 94 | c.labels.append((token.NAME, value)) |
| 95 | c.keywords[value] = ilabel |
| 96 | return ilabel |
| 97 | else: |
| 98 | # An operator (any non-numeric token) |
| 99 | itoken = grammar.opmap[value] # Fails if unknown token |
| 100 | if itoken in c.tokens: |
| 101 | return c.tokens[itoken] |
| 102 | else: |
| 103 | c.labels.append((itoken, None)) |
| 104 | c.tokens[itoken] = ilabel |
| 105 | return ilabel |
| 106 | |
| 107 | def addfirstsets(self): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 108 | names = list(self.dfas.keys()) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 109 | names.sort() |
| 110 | for name in names: |
| 111 | if name not in self.first: |
| 112 | self.calcfirst(name) |
| 113 | #print name, self.first[name].keys() |
| 114 | |
| 115 | def calcfirst(self, name): |
| 116 | dfa = self.dfas[name] |
| 117 | self.first[name] = None # dummy to detect left recursion |
| 118 | state = dfa[0] |
| 119 | totalset = {} |
| 120 | overlapcheck = {} |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 121 | for label, next in state.arcs.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 122 | if label in self.dfas: |
| 123 | if label in self.first: |
| 124 | fset = self.first[label] |
| 125 | if fset is None: |
| 126 | raise ValueError("recursion for rule %r" % name) |
| 127 | else: |
| 128 | self.calcfirst(label) |
| 129 | fset = self.first[label] |
| 130 | totalset.update(fset) |
| 131 | overlapcheck[label] = fset |
| 132 | else: |
| 133 | totalset[label] = 1 |
| 134 | overlapcheck[label] = {label: 1} |
| 135 | inverse = {} |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 136 | for label, itsfirst in overlapcheck.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 137 | for symbol in itsfirst: |
| 138 | if symbol in inverse: |
| 139 | raise ValueError("rule %s is ambiguous; %s is in the" |
| 140 | " first sets of %s as well as %s" % |
| 141 | (name, symbol, label, inverse[symbol])) |
| 142 | inverse[symbol] = label |
| 143 | self.first[name] = totalset |
| 144 | |
| 145 | def parse(self): |
| 146 | dfas = {} |
| 147 | startsymbol = None |
| 148 | # MSTART: (NEWLINE | RULE)* ENDMARKER |
| 149 | while self.type != token.ENDMARKER: |
| 150 | while self.type == token.NEWLINE: |
| 151 | self.gettoken() |
| 152 | # RULE: NAME ':' RHS NEWLINE |
| 153 | name = self.expect(token.NAME) |
| 154 | self.expect(token.OP, ":") |
| 155 | a, z = self.parse_rhs() |
| 156 | self.expect(token.NEWLINE) |
| 157 | #self.dump_nfa(name, a, z) |
| 158 | dfa = self.make_dfa(a, z) |
| 159 | #self.dump_dfa(name, dfa) |
| 160 | oldlen = len(dfa) |
| 161 | self.simplify_dfa(dfa) |
| 162 | newlen = len(dfa) |
| 163 | dfas[name] = dfa |
| 164 | #print name, oldlen, newlen |
| 165 | if startsymbol is None: |
| 166 | startsymbol = name |
| 167 | return dfas, startsymbol |
| 168 | |
| 169 | def make_dfa(self, start, finish): |
| 170 | # To turn an NFA into a DFA, we define the states of the DFA |
| 171 | # to correspond to *sets* of states of the NFA. Then do some |
| 172 | # state reduction. Let's represent sets as dicts with 1 for |
| 173 | # values. |
| 174 | assert isinstance(start, NFAState) |
| 175 | assert isinstance(finish, NFAState) |
| 176 | def closure(state): |
| 177 | base = {} |
| 178 | addclosure(state, base) |
| 179 | return base |
| 180 | def addclosure(state, base): |
| 181 | assert isinstance(state, NFAState) |
| 182 | if state in base: |
| 183 | return |
| 184 | base[state] = 1 |
| 185 | for label, next in state.arcs: |
| 186 | if label is None: |
| 187 | addclosure(next, base) |
| 188 | states = [DFAState(closure(start), finish)] |
| 189 | for state in states: # NB states grows while we're iterating |
| 190 | arcs = {} |
| 191 | for nfastate in state.nfaset: |
| 192 | for label, next in nfastate.arcs: |
| 193 | if label is not None: |
| 194 | addclosure(next, arcs.setdefault(label, {})) |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 195 | for label, nfaset in arcs.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 196 | for st in states: |
| 197 | if st.nfaset == nfaset: |
| 198 | break |
| 199 | else: |
| 200 | st = DFAState(nfaset, finish) |
| 201 | states.append(st) |
| 202 | state.addarc(st, label) |
| 203 | return states # List of DFAState instances; first one is start |
| 204 | |
| 205 | def dump_nfa(self, name, start, finish): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 206 | print("Dump of NFA for", name) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 207 | todo = [start] |
| 208 | for i, state in enumerate(todo): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 209 | print(" State", i, state is finish and "(final)" or "") |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 210 | for label, next in state.arcs: |
| 211 | if next in todo: |
| 212 | j = todo.index(next) |
| 213 | else: |
| 214 | j = len(todo) |
| 215 | todo.append(next) |
| 216 | if label is None: |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 217 | print(" -> %d" % j) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 218 | else: |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 219 | print(" %s -> %d" % (label, j)) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 220 | |
| 221 | def dump_dfa(self, name, dfa): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 222 | print("Dump of DFA for", name) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 223 | for i, state in enumerate(dfa): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 224 | print(" State", i, state.isfinal and "(final)" or "") |
| 225 | for label, next in state.arcs.items(): |
| 226 | print(" %s -> %d" % (label, dfa.index(next))) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 227 | |
| 228 | def simplify_dfa(self, dfa): |
| 229 | # This is not theoretically optimal, but works well enough. |
| 230 | # Algorithm: repeatedly look for two states that have the same |
| 231 | # set of arcs (same labels pointing to the same nodes) and |
| 232 | # unify them, until things stop changing. |
| 233 | |
| 234 | # dfa is a list of DFAState instances |
| 235 | changes = True |
| 236 | while changes: |
| 237 | changes = False |
| 238 | for i, state_i in enumerate(dfa): |
| 239 | for j in range(i+1, len(dfa)): |
| 240 | state_j = dfa[j] |
| 241 | if state_i == state_j: |
| 242 | #print " unify", i, j |
| 243 | del dfa[j] |
| 244 | for state in dfa: |
| 245 | state.unifystate(state_j, state_i) |
| 246 | changes = True |
| 247 | break |
| 248 | |
| 249 | def parse_rhs(self): |
| 250 | # RHS: ALT ('|' ALT)* |
| 251 | a, z = self.parse_alt() |
| 252 | if self.value != "|": |
| 253 | return a, z |
| 254 | else: |
| 255 | aa = NFAState() |
| 256 | zz = NFAState() |
| 257 | aa.addarc(a) |
| 258 | z.addarc(zz) |
| 259 | while self.value == "|": |
| 260 | self.gettoken() |
| 261 | a, z = self.parse_alt() |
| 262 | aa.addarc(a) |
| 263 | z.addarc(zz) |
| 264 | return aa, zz |
| 265 | |
| 266 | def parse_alt(self): |
| 267 | # ALT: ITEM+ |
| 268 | a, b = self.parse_item() |
| 269 | while (self.value in ("(", "[") or |
| 270 | self.type in (token.NAME, token.STRING)): |
| 271 | c, d = self.parse_item() |
| 272 | b.addarc(c) |
| 273 | b = d |
| 274 | return a, b |
| 275 | |
| 276 | def parse_item(self): |
| 277 | # ITEM: '[' RHS ']' | ATOM ['+' | '*'] |
| 278 | if self.value == "[": |
| 279 | self.gettoken() |
| 280 | a, z = self.parse_rhs() |
| 281 | self.expect(token.OP, "]") |
| 282 | a.addarc(z) |
| 283 | return a, z |
| 284 | else: |
| 285 | a, z = self.parse_atom() |
| 286 | value = self.value |
| 287 | if value not in ("+", "*"): |
| 288 | return a, z |
| 289 | self.gettoken() |
| 290 | z.addarc(a) |
| 291 | if value == "+": |
| 292 | return a, z |
| 293 | else: |
| 294 | return a, a |
| 295 | |
| 296 | def parse_atom(self): |
| 297 | # ATOM: '(' RHS ')' | NAME | STRING |
| 298 | if self.value == "(": |
| 299 | self.gettoken() |
| 300 | a, z = self.parse_rhs() |
| 301 | self.expect(token.OP, ")") |
| 302 | return a, z |
| 303 | elif self.type in (token.NAME, token.STRING): |
| 304 | a = NFAState() |
| 305 | z = NFAState() |
| 306 | a.addarc(z, self.value) |
| 307 | self.gettoken() |
| 308 | return a, z |
| 309 | else: |
| 310 | self.raise_error("expected (...) or NAME or STRING, got %s/%s", |
| 311 | self.type, self.value) |
| 312 | |
| 313 | def expect(self, type, value=None): |
| 314 | if self.type != type or (value is not None and self.value != value): |
| 315 | self.raise_error("expected %s/%s, got %s/%s", |
| 316 | type, value, self.type, self.value) |
| 317 | value = self.value |
| 318 | self.gettoken() |
| 319 | return value |
| 320 | |
| 321 | def gettoken(self): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 322 | tup = next(self.generator) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 323 | while tup[0] in (tokenize.COMMENT, tokenize.NL): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 324 | tup = next(self.generator) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 325 | self.type, self.value, self.begin, self.end, self.line = tup |
| 326 | #print token.tok_name[self.type], repr(self.value) |
| 327 | |
| 328 | def raise_error(self, msg, *args): |
| 329 | if args: |
| 330 | try: |
| 331 | msg = msg % args |
| 332 | except: |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 333 | msg = " ".join([msg] + list(map(str, args))) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 334 | raise SyntaxError(msg, (self.filename, self.end[0], |
| 335 | self.end[1], self.line)) |
| 336 | |
| 337 | class NFAState(object): |
| 338 | |
| 339 | def __init__(self): |
| 340 | self.arcs = [] # list of (label, NFAState) pairs |
| 341 | |
| 342 | def addarc(self, next, label=None): |
| 343 | assert label is None or isinstance(label, str) |
| 344 | assert isinstance(next, NFAState) |
| 345 | self.arcs.append((label, next)) |
| 346 | |
| 347 | class DFAState(object): |
| 348 | |
| 349 | def __init__(self, nfaset, final): |
| 350 | assert isinstance(nfaset, dict) |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 351 | assert isinstance(next(iter(nfaset)), NFAState) |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 352 | assert isinstance(final, NFAState) |
| 353 | self.nfaset = nfaset |
| 354 | self.isfinal = final in nfaset |
| 355 | self.arcs = {} # map from label to DFAState |
| 356 | |
| 357 | def addarc(self, next, label): |
| 358 | assert isinstance(label, str) |
| 359 | assert label not in self.arcs |
| 360 | assert isinstance(next, DFAState) |
| 361 | self.arcs[label] = next |
| 362 | |
| 363 | def unifystate(self, old, new): |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 364 | for label, next in self.arcs.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 365 | if next is old: |
| 366 | self.arcs[label] = new |
| 367 | |
| 368 | def __eq__(self, other): |
| 369 | # Equality test -- ignore the nfaset instance variable |
| 370 | assert isinstance(other, DFAState) |
| 371 | if self.isfinal != other.isfinal: |
| 372 | return False |
| 373 | # Can't just return self.arcs == other.arcs, because that |
| 374 | # would invoke this method recursively, with cycles... |
| 375 | if len(self.arcs) != len(other.arcs): |
| 376 | return False |
Martin v. Löwis | 8a5f8ca | 2008-03-19 05:33:36 +0000 | [diff] [blame] | 377 | for label, next in self.arcs.items(): |
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 378 | if next is not other.arcs.get(label): |
| 379 | return False |
| 380 | return True |
| 381 | |
| 382 | def generate_grammar(filename="Grammar.txt"): |
| 383 | p = ParserGenerator(filename) |
| 384 | return p.make_grammar() |