blob: 088c58bfa99c1b1b0ea5f269604e919e74685c3c [file] [log] [blame]
Martin v. Löwisef04c442008-03-19 05:04:44 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +000016import collections
Martin v. Löwisef04c442008-03-19 05:04:44 +000017import pickle
18
19# Local imports
Victor Stinnerd6debb22017-03-27 16:05:26 +020020from . import token
Martin v. Löwisef04c442008-03-19 05:04:44 +000021
22
23class Grammar(object):
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040024 """Pgen parsing tables conversion class.
Martin v. Löwisef04c442008-03-19 05:04:44 +000025
26 Once initialized, this class supplies the grammar tables for the
27 parsing engine implemented by parse.py. The parsing engine
28 accesses the instance variables directly. The class here does not
29 provide initialization of the tables; several subclasses exist to
30 do this (see the conv and pgen modules).
31
32 The load() method reads the tables from a pickle file, which is
33 much faster than the other ways offered by subclasses. The pickle
34 file is written by calling dump() (after loading the grammar
35 tables using a subclass). The report() method prints a readable
36 representation of the tables to stdout, for debugging.
37
38 The instance variables are as follows:
39
40 symbol2number -- a dict mapping symbol names to numbers. Symbol
41 numbers are always 256 or higher, to distinguish
42 them from token numbers, which are between 0 and
43 255 (inclusive).
44
45 number2symbol -- a dict mapping numbers to symbol names;
46 these two are each other's inverse.
47
48 states -- a list of DFAs, where each DFA is a list of
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040049 states, each state is a list of arcs, and each
Martin v. Löwisef04c442008-03-19 05:04:44 +000050 arc is a (i, j) pair where i is a label and j is
51 a state number. The DFA number is the index into
52 this list. (This name is slightly confusing.)
53 Final states are represented by a special arc of
54 the form (0, j) where j is its own state number.
55
56 dfas -- a dict mapping symbol numbers to (DFA, first)
57 pairs, where DFA is an item from the states list
58 above, and first is a set of tokens that can
59 begin this grammar rule (represented by a dict
60 whose values are always 1).
61
62 labels -- a list of (x, y) pairs where x is either a token
63 number or a symbol number, and y is either None
64 or a string; the strings are keywords. The label
65 number is the index in this list; label numbers
66 are used to mark state transitions (arcs) in the
67 DFAs.
68
69 start -- the number of the grammar's start symbol.
70
71 keywords -- a dict mapping keyword strings to arc labels.
72
73 tokens -- a dict mapping token numbers to arc labels.
74
75 """
76
77 def __init__(self):
78 self.symbol2number = {}
79 self.number2symbol = {}
80 self.states = []
81 self.dfas = {}
82 self.labels = [(0, "EMPTY")]
83 self.keywords = {}
84 self.tokens = {}
85 self.symbol2label = {}
86 self.start = 256
87
88 def dump(self, filename):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +000089 """Dump the grammar tables to a pickle file.
90
91 dump() recursively changes all dict to OrderedDict, so the pickled file
92 is not exactly the same as what was passed in to dump(). load() uses the
93 pickled file to create the tables, but only changes OrderedDict to dict
94 at the top level; it does not recursively change OrderedDict to dict.
95 So, the loaded tables are different from the original tables that were
96 passed to load() in that some of the OrderedDict (from the pickled file)
97 are not changed back to dict. For parsing, this has no effect on
98 performance because OrderedDict uses dict's __getitem__ with nothing in
99 between.
100 """
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +0100101 with open(filename, "wb") as f:
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000102 d = _make_deterministic(self.__dict__)
103 pickle.dump(d, f, 2)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000104
105 def load(self, filename):
106 """Load the grammar tables from a pickle file."""
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +0100107 with open(filename, "rb") as f:
108 d = pickle.load(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000109 self.__dict__.update(d)
110
Benjamin Peterson8a587712017-12-22 12:18:33 -0800111 def loads(self, pkl):
112 """Load the grammar tables from a pickle bytes object."""
113 self.__dict__.update(pickle.loads(pkl))
114
Benjamin Peterson3059b002009-07-20 16:42:03 +0000115 def copy(self):
116 """
117 Copy the grammar.
118 """
119 new = self.__class__()
120 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
121 "tokens", "symbol2label"):
122 setattr(new, dict_attr, getattr(self, dict_attr).copy())
123 new.labels = self.labels[:]
124 new.states = self.states[:]
125 new.start = self.start
126 return new
127
Martin v. Löwisef04c442008-03-19 05:04:44 +0000128 def report(self):
129 """Dump the grammar tables to standard output, for debugging."""
130 from pprint import pprint
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000131 print("s2n")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000132 pprint(self.symbol2number)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000133 print("n2s")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000134 pprint(self.number2symbol)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000135 print("states")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000136 pprint(self.states)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000137 print("dfas")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000138 pprint(self.dfas)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000139 print("labels")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000140 pprint(self.labels)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000141 print("start", self.start)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000142
143
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000144def _make_deterministic(top):
145 if isinstance(top, dict):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000146 return collections.OrderedDict(
147 sorted(((k, _make_deterministic(v)) for k, v in top.items())))
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000148 if isinstance(top, list):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000149 return [_make_deterministic(e) for e in top]
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000150 if isinstance(top, tuple):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000151 return tuple(_make_deterministic(e) for e in top)
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000152 return top
153
154
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Each non-blank line of opmap_raw is "<operator> <token-name>"; resolve
# the name against the local token module to get the numeric code.
opmap = {
    op: getattr(token, name)
    for op, name in (entry.split() for entry in opmap_raw.splitlines() if entry)
}