blob: 75255e9c013827dd3b740eae4c883db01f12abdb [file] [log] [blame]
Martin v. Löwis5e37bae2008-03-19 04:43:46 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)280bc222016-09-08 01:04:37 +000016import collections
Martin v. Löwis5e37bae2008-03-19 04:43:46 +000017import pickle
18
19# Local imports
20from . import token, tokenize
21
22
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the special EMPTY label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the lowest possible symbol number; subclasses set the
        # real start symbol when they build the tables.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled file
        is not exactly the same as what was passed in to dump(). load() uses the
        pickled file to create the tables, but only changes OrderedDict to dict
        at the top level; it does not recursively change OrderedDict to dict.
        So, the loaded tables are different from the original tables that were
        passed to load() in that some of the OrderedDict (from the pickled file)
        are not changed back to dict. For parsing, this has no effect on
        performance because OrderedDict uses dict's __getitem__ with nothing in
        between.
        """
        with open(filename, "wb") as f:
            d = _make_deterministic(self.__dict__)
            pickle.dump(d, f, 2)  # protocol 2 for compact, portable output

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        # Use a with-statement so the file object is closed even if
        # pickle.load() raises; the previous open()/close() sequence
        # leaked the file handle on unpickling errors.
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def copy(self):
        """Return a new Grammar with one-level-deep copies of the tables.

        The dict attributes and the labels/states lists are copied, so
        the copy's tables can be extended without mutating the
        original's; the items they contain are still shared.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        # Single-argument print(...) calls emit the same output as the
        # old Python 2 print statements while also parsing on Python 3.
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start %d" % (self.start,))
139
140
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)280bc222016-09-08 01:04:37 +0000141def _make_deterministic(top):
142 if isinstance(top, dict):
143 return collections.OrderedDict(
144 sorted(((k, _make_deterministic(v)) for k, v in top.iteritems())))
145 if isinstance(top, list):
146 return [_make_deterministic(e) for e in top]
147 if isinstance(top, tuple):
148 return tuple(_make_deterministic(e) for e in top)
149 return top
150
151
# Map from operator to number (since tokenize doesn't do this)
# Each non-blank line of opmap_raw is "<operator text> <name of the
# corresponding constant in the token module>"; the table is parsed
# into the opmap dict at the bottom of this module.

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""
203
# Parse opmap_raw into a dict mapping operator text to its token number,
# looking each symbolic name up in the token module.
opmap = {}
for entry in filter(None, opmap_raw.splitlines()):
    operator, token_name = entry.split()
    opmap[operator] = getattr(token, token_name)