blob: c10dcfa9ac277708188fa91923645fd706e7d2a9 [file] [log] [blame]
Martin v. Löwisef04c442008-03-19 05:04:44 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +000016import collections
Martin v. Löwisef04c442008-03-19 05:04:44 +000017import pickle
18
19# Local imports
Victor Stinnerd6debb22017-03-27 16:05:26 +020020from . import token
Martin v. Löwisef04c442008-03-19 05:04:44 +000021
22
23class Grammar(object):
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040024 """Pgen parsing tables conversion class.
Martin v. Löwisef04c442008-03-19 05:04:44 +000025
26 Once initialized, this class supplies the grammar tables for the
27 parsing engine implemented by parse.py. The parsing engine
28 accesses the instance variables directly. The class here does not
29 provide initialization of the tables; several subclasses exist to
30 do this (see the conv and pgen modules).
31
32 The load() method reads the tables from a pickle file, which is
33 much faster than the other ways offered by subclasses. The pickle
34 file is written by calling dump() (after loading the grammar
35 tables using a subclass). The report() method prints a readable
36 representation of the tables to stdout, for debugging.
37
38 The instance variables are as follows:
39
40 symbol2number -- a dict mapping symbol names to numbers. Symbol
41 numbers are always 256 or higher, to distinguish
42 them from token numbers, which are between 0 and
43 255 (inclusive).
44
45 number2symbol -- a dict mapping numbers to symbol names;
46 these two are each other's inverse.
47
48 states -- a list of DFAs, where each DFA is a list of
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040049 states, each state is a list of arcs, and each
Martin v. Löwisef04c442008-03-19 05:04:44 +000050 arc is a (i, j) pair where i is a label and j is
51 a state number. The DFA number is the index into
52 this list. (This name is slightly confusing.)
53 Final states are represented by a special arc of
54 the form (0, j) where j is its own state number.
55
56 dfas -- a dict mapping symbol numbers to (DFA, first)
57 pairs, where DFA is an item from the states list
58 above, and first is a set of tokens that can
59 begin this grammar rule (represented by a dict
60 whose values are always 1).
61
62 labels -- a list of (x, y) pairs where x is either a token
63 number or a symbol number, and y is either None
64 or a string; the strings are keywords. The label
65 number is the index in this list; label numbers
66 are used to mark state transitions (arcs) in the
67 DFAs.
68
69 start -- the number of the grammar's start symbol.
70
71 keywords -- a dict mapping keyword strings to arc labels.
72
73 tokens -- a dict mapping token numbers to arc labels.
74
75 """
76
77 def __init__(self):
78 self.symbol2number = {}
79 self.number2symbol = {}
80 self.states = []
81 self.dfas = {}
82 self.labels = [(0, "EMPTY")]
83 self.keywords = {}
84 self.tokens = {}
85 self.symbol2label = {}
86 self.start = 256
87
88 def dump(self, filename):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +000089 """Dump the grammar tables to a pickle file.
90
91 dump() recursively changes all dict to OrderedDict, so the pickled file
92 is not exactly the same as what was passed in to dump(). load() uses the
93 pickled file to create the tables, but only changes OrderedDict to dict
94 at the top level; it does not recursively change OrderedDict to dict.
95 So, the loaded tables are different from the original tables that were
96 passed to load() in that some of the OrderedDict (from the pickled file)
97 are not changed back to dict. For parsing, this has no effect on
98 performance because OrderedDict uses dict's __getitem__ with nothing in
99 between.
100 """
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +0100101 with open(filename, "wb") as f:
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000102 d = _make_deterministic(self.__dict__)
103 pickle.dump(d, f, 2)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000104
105 def load(self, filename):
106 """Load the grammar tables from a pickle file."""
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +0100107 with open(filename, "rb") as f:
108 d = pickle.load(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000109 self.__dict__.update(d)
110
Benjamin Peterson3059b002009-07-20 16:42:03 +0000111 def copy(self):
112 """
113 Copy the grammar.
114 """
115 new = self.__class__()
116 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
117 "tokens", "symbol2label"):
118 setattr(new, dict_attr, getattr(self, dict_attr).copy())
119 new.labels = self.labels[:]
120 new.states = self.states[:]
121 new.start = self.start
122 return new
123
Martin v. Löwisef04c442008-03-19 05:04:44 +0000124 def report(self):
125 """Dump the grammar tables to standard output, for debugging."""
126 from pprint import pprint
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000127 print("s2n")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000128 pprint(self.symbol2number)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000129 print("n2s")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000130 pprint(self.number2symbol)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000131 print("states")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000132 pprint(self.states)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000133 print("dfas")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000134 pprint(self.dfas)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000135 print("labels")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000136 pprint(self.labels)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000137 print("start", self.start)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000138
139
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000140def _make_deterministic(top):
141 if isinstance(top, dict):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000142 return collections.OrderedDict(
143 sorted(((k, _make_deterministic(v)) for k, v in top.items())))
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000144 if isinstance(top, list):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000145 return [_make_deterministic(e) for e in top]
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000146 if isinstance(top, tuple):
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)97191612016-09-08 00:48:07 +0000147 return tuple(_make_deterministic(e) for e in top)
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +0000148 return top
149
150
Martin v. Löwisef04c442008-03-19 05:04:44 +0000151# Map from operator to number (since tokenize doesn't do this)
152
153opmap_raw = """
154( LPAR
155) RPAR
156[ LSQB
157] RSQB
158: COLON
159, COMMA
160; SEMI
161+ PLUS
162- MINUS
163* STAR
164/ SLASH
165| VBAR
166& AMPER
167< LESS
168> GREATER
169= EQUAL
170. DOT
171% PERCENT
172` BACKQUOTE
173{ LBRACE
174} RBRACE
175@ AT
Benjamin Peterson4ab92c82014-04-10 00:12:47 -0400176@= ATEQUAL
Martin v. Löwisef04c442008-03-19 05:04:44 +0000177== EQEQUAL
178!= NOTEQUAL
179<> NOTEQUAL
180<= LESSEQUAL
181>= GREATEREQUAL
182~ TILDE
183^ CIRCUMFLEX
184<< LEFTSHIFT
185>> RIGHTSHIFT
186** DOUBLESTAR
187+= PLUSEQUAL
188-= MINEQUAL
189*= STAREQUAL
190/= SLASHEQUAL
191%= PERCENTEQUAL
192&= AMPEREQUAL
193|= VBAREQUAL
194^= CIRCUMFLEXEQUAL
195<<= LEFTSHIFTEQUAL
196>>= RIGHTSHIFTEQUAL
197**= DOUBLESTAREQUAL
198// DOUBLESLASH
199//= DOUBLESLASHEQUAL
200-> RARROW
201"""
202
203opmap = {}
204for line in opmap_raw.splitlines():
205 if line:
206 op, name = line.split()
207 opmap[op] = getattr(token, name)