blob: 8220b0aa5e14134ee31b1fa1be1bf77c08200af5 [file] [log] [blame]
Martin v. Löwis5e37bae2008-03-19 04:43:46 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
16import pickle
17
18# Local imports
19from . import token, tokenize
20
21
22class Grammar(object):
Terry Jan Reedya70f60a2013-03-11 17:56:17 -040023 """Pgen parsing tables conversion class.
Martin v. Löwis5e37bae2008-03-19 04:43:46 +000024
25 Once initialized, this class supplies the grammar tables for the
26 parsing engine implemented by parse.py. The parsing engine
27 accesses the instance variables directly. The class here does not
28 provide initialization of the tables; several subclasses exist to
29 do this (see the conv and pgen modules).
30
31 The load() method reads the tables from a pickle file, which is
32 much faster than the other ways offered by subclasses. The pickle
33 file is written by calling dump() (after loading the grammar
34 tables using a subclass). The report() method prints a readable
35 representation of the tables to stdout, for debugging.
36
37 The instance variables are as follows:
38
39 symbol2number -- a dict mapping symbol names to numbers. Symbol
40 numbers are always 256 or higher, to distinguish
41 them from token numbers, which are between 0 and
42 255 (inclusive).
43
44 number2symbol -- a dict mapping numbers to symbol names;
45 these two are each other's inverse.
46
47 states -- a list of DFAs, where each DFA is a list of
Terry Jan Reedya70f60a2013-03-11 17:56:17 -040048 states, each state is a list of arcs, and each
Martin v. Löwis5e37bae2008-03-19 04:43:46 +000049 arc is a (i, j) pair where i is a label and j is
50 a state number. The DFA number is the index into
51 this list. (This name is slightly confusing.)
52 Final states are represented by a special arc of
53 the form (0, j) where j is its own state number.
54
55 dfas -- a dict mapping symbol numbers to (DFA, first)
56 pairs, where DFA is an item from the states list
57 above, and first is a set of tokens that can
58 begin this grammar rule (represented by a dict
59 whose values are always 1).
60
61 labels -- a list of (x, y) pairs where x is either a token
62 number or a symbol number, and y is either None
63 or a string; the strings are keywords. The label
64 number is the index in this list; label numbers
65 are used to mark state transitions (arcs) in the
66 DFAs.
67
68 start -- the number of the grammar's start symbol.
69
70 keywords -- a dict mapping keyword strings to arc labels.
71
72 tokens -- a dict mapping token numbers to arc labels.
73
74 """
75
76 def __init__(self):
77 self.symbol2number = {}
78 self.number2symbol = {}
79 self.states = []
80 self.dfas = {}
81 self.labels = [(0, "EMPTY")]
82 self.keywords = {}
83 self.tokens = {}
84 self.symbol2label = {}
85 self.start = 256
86
87 def dump(self, filename):
88 """Dump the grammar tables to a pickle file."""
89 f = open(filename, "wb")
90 pickle.dump(self.__dict__, f, 2)
91 f.close()
92
93 def load(self, filename):
94 """Load the grammar tables from a pickle file."""
95 f = open(filename, "rb")
96 d = pickle.load(f)
97 f.close()
98 self.__dict__.update(d)
99
Benjamin Peterson840077c2009-07-20 15:33:09 +0000100 def copy(self):
101 """
102 Copy the grammar.
103 """
104 new = self.__class__()
105 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
106 "tokens", "symbol2label"):
107 setattr(new, dict_attr, getattr(self, dict_attr).copy())
108 new.labels = self.labels[:]
109 new.states = self.states[:]
110 new.start = self.start
111 return new
112
Martin v. Löwis5e37bae2008-03-19 04:43:46 +0000113 def report(self):
114 """Dump the grammar tables to standard output, for debugging."""
115 from pprint import pprint
116 print "s2n"
117 pprint(self.symbol2number)
118 print "n2s"
119 pprint(self.number2symbol)
120 print "states"
121 pprint(self.states)
122 print "dfas"
123 pprint(self.dfas)
124 print "labels"
125 pprint(self.labels)
126 print "start", self.start
127
128
129# Map from operator to number (since tokenize doesn't do this)
130
131opmap_raw = """
132( LPAR
133) RPAR
134[ LSQB
135] RSQB
136: COLON
137, COMMA
138; SEMI
139+ PLUS
140- MINUS
141* STAR
142/ SLASH
143| VBAR
144& AMPER
145< LESS
146> GREATER
147= EQUAL
148. DOT
149% PERCENT
150` BACKQUOTE
151{ LBRACE
152} RBRACE
153@ AT
Benjamin Petersonda952f32014-04-10 00:12:47 -0400154@= ATEQUAL
Martin v. Löwis5e37bae2008-03-19 04:43:46 +0000155== EQEQUAL
156!= NOTEQUAL
157<> NOTEQUAL
158<= LESSEQUAL
159>= GREATEREQUAL
160~ TILDE
161^ CIRCUMFLEX
162<< LEFTSHIFT
163>> RIGHTSHIFT
164** DOUBLESTAR
165+= PLUSEQUAL
166-= MINEQUAL
167*= STAREQUAL
168/= SLASHEQUAL
169%= PERCENTEQUAL
170&= AMPEREQUAL
171|= VBAREQUAL
172^= CIRCUMFLEXEQUAL
173<<= LEFTSHIFTEQUAL
174>>= RIGHTSHIFTEQUAL
175**= DOUBLESTAREQUAL
176// DOUBLESLASH
177//= DOUBLESLASHEQUAL
178-> RARROW
179"""
180
181opmap = {}
182for line in opmap_raw.splitlines():
183 if line:
184 op, name = line.split()
185 opmap[op] = getattr(token, name)