Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
| 2 | # Licensed to PSF under a Contributor Agreement. |
| 3 | |
| 4 | """This module defines the data structures used to represent a grammar. |
| 5 | |
| 6 | These are a bit arcane because they are derived from the data |
| 7 | structures used by Python's 'pgen' parser generator. |
| 8 | |
| 9 | There's also a table here mapping operators to their names in the |
| 10 | token module; the Python tokenize module reports all operators as the |
| 11 | fallback token code OP, but the parser needs the actual token code. |
| 12 | |
| 13 | """ |
| 14 | |
| 15 | # Python imports |
| 16 | import pickle |
| 17 | |
| 18 | # Local imports |
| 19 | from . import token, tokenize |
| 20 | |
| 21 | |
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict, empty after construction and filled in by
                     the table-building subclasses.
                     NOTE(review): presumably maps symbol names to
                     label numbers, mirroring keywords/tokens -- confirm
                     against the pgen module.

    """

    def __init__(self):
        # Start with empty tables; subclasses (or load()) populate them.
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the special EMPTY label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the lowest possible symbol number, hence the default start.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        Uses pickle protocol 2.  The file is closed even if pickling
        fails (previously the handle leaked on error).
        """
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file.

        The pickled dict is merged into this instance's __dict__, so
        any attribute not present in the pickle keeps its current value.
        """
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)
Martin v. Löwis | ef04c44 | 2008-03-19 05:04:44 +0000 | [diff] [blame] | 114 | |
| 115 | |
# Map from operator string to token name (the tokenize module reports
# every operator as the generic OP token, so the parser must recover the
# specific token code itself).  Each non-blank line below is
# "<operator> <attribute name in the token module>"; the table is parsed
# into the opmap dict right after this string.

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""
| 166 | |
# Resolve each "<operator> <name>" entry of opmap_raw into a mapping
# from operator string to its numeric token code.
opmap = {
    op: getattr(token, name)
    for op, name in (entry.split()
                     for entry in opmap_raw.splitlines()
                     if entry)
}