blob: c00cb2248d5676092c82d9da3f67d9b2f54f67ad [file] [log] [blame]
Martin v. Löwisef04c442008-03-19 05:04:44 +00001# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)dd1c6382016-09-08 00:40:07 +000016import collections
Martin v. Löwisef04c442008-03-19 05:04:44 +000017import pickle
18
19# Local imports
Victor Stinnerd6debb22017-03-27 16:05:26 +020020from . import token
Martin v. Löwisef04c442008-03-19 05:04:44 +000021
22
23class Grammar(object):
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040024 """Pgen parsing tables conversion class.
Martin v. Löwisef04c442008-03-19 05:04:44 +000025
26 Once initialized, this class supplies the grammar tables for the
27 parsing engine implemented by parse.py. The parsing engine
28 accesses the instance variables directly. The class here does not
29 provide initialization of the tables; several subclasses exist to
30 do this (see the conv and pgen modules).
31
32 The load() method reads the tables from a pickle file, which is
33 much faster than the other ways offered by subclasses. The pickle
34 file is written by calling dump() (after loading the grammar
35 tables using a subclass). The report() method prints a readable
36 representation of the tables to stdout, for debugging.
37
38 The instance variables are as follows:
39
40 symbol2number -- a dict mapping symbol names to numbers. Symbol
41 numbers are always 256 or higher, to distinguish
42 them from token numbers, which are between 0 and
43 255 (inclusive).
44
45 number2symbol -- a dict mapping numbers to symbol names;
46 these two are each other's inverse.
47
48 states -- a list of DFAs, where each DFA is a list of
Terry Jan Reedyc30b7b12013-03-11 17:57:08 -040049 states, each state is a list of arcs, and each
Martin v. Löwisef04c442008-03-19 05:04:44 +000050 arc is a (i, j) pair where i is a label and j is
51 a state number. The DFA number is the index into
52 this list. (This name is slightly confusing.)
53 Final states are represented by a special arc of
54 the form (0, j) where j is its own state number.
55
56 dfas -- a dict mapping symbol numbers to (DFA, first)
57 pairs, where DFA is an item from the states list
58 above, and first is a set of tokens that can
59 begin this grammar rule (represented by a dict
60 whose values are always 1).
61
62 labels -- a list of (x, y) pairs where x is either a token
63 number or a symbol number, and y is either None
64 or a string; the strings are keywords. The label
65 number is the index in this list; label numbers
66 are used to mark state transitions (arcs) in the
67 DFAs.
68
69 start -- the number of the grammar's start symbol.
70
71 keywords -- a dict mapping keyword strings to arc labels.
72
73 tokens -- a dict mapping token numbers to arc labels.
74
75 """
76
77 def __init__(self):
78 self.symbol2number = {}
79 self.number2symbol = {}
80 self.states = []
81 self.dfas = {}
82 self.labels = [(0, "EMPTY")]
83 self.keywords = {}
84 self.tokens = {}
85 self.symbol2label = {}
86 self.start = 256
87
88 def dump(self, filename):
Łukasz Langa76618062018-04-16 17:33:59 -070089 """Dump the grammar tables to a pickle file."""
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +010090 with open(filename, "wb") as f:
Łukasz Langa76618062018-04-16 17:33:59 -070091 pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
Martin v. Löwisef04c442008-03-19 05:04:44 +000092
93 def load(self, filename):
94 """Load the grammar tables from a pickle file."""
Giampaolo Rodola'2f50aaf2013-02-12 02:04:27 +010095 with open(filename, "rb") as f:
96 d = pickle.load(f)
Martin v. Löwisef04c442008-03-19 05:04:44 +000097 self.__dict__.update(d)
98
Benjamin Peterson8a587712017-12-22 12:18:33 -080099 def loads(self, pkl):
100 """Load the grammar tables from a pickle bytes object."""
101 self.__dict__.update(pickle.loads(pkl))
102
Benjamin Peterson3059b002009-07-20 16:42:03 +0000103 def copy(self):
104 """
105 Copy the grammar.
106 """
107 new = self.__class__()
108 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
109 "tokens", "symbol2label"):
110 setattr(new, dict_attr, getattr(self, dict_attr).copy())
111 new.labels = self.labels[:]
112 new.states = self.states[:]
113 new.start = self.start
114 return new
115
Martin v. Löwisef04c442008-03-19 05:04:44 +0000116 def report(self):
117 """Dump the grammar tables to standard output, for debugging."""
118 from pprint import pprint
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000119 print("s2n")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000120 pprint(self.symbol2number)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000121 print("n2s")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000122 pprint(self.number2symbol)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000123 print("states")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000124 pprint(self.states)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000125 print("dfas")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000126 pprint(self.dfas)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000127 print("labels")
Martin v. Löwisef04c442008-03-19 05:04:44 +0000128 pprint(self.labels)
Martin v. Löwis8a5f8ca2008-03-19 05:33:36 +0000129 print("start", self.start)
Martin v. Löwisef04c442008-03-19 05:04:44 +0000130
131
132# Map from operator to number (since tokenize doesn't do this)
133
134opmap_raw = """
135( LPAR
136) RPAR
137[ LSQB
138] RSQB
139: COLON
140, COMMA
141; SEMI
142+ PLUS
143- MINUS
144* STAR
145/ SLASH
146| VBAR
147& AMPER
148< LESS
149> GREATER
150= EQUAL
151. DOT
152% PERCENT
153` BACKQUOTE
154{ LBRACE
155} RBRACE
156@ AT
Benjamin Peterson4ab92c82014-04-10 00:12:47 -0400157@= ATEQUAL
Martin v. Löwisef04c442008-03-19 05:04:44 +0000158== EQEQUAL
159!= NOTEQUAL
160<> NOTEQUAL
161<= LESSEQUAL
162>= GREATEREQUAL
163~ TILDE
164^ CIRCUMFLEX
165<< LEFTSHIFT
166>> RIGHTSHIFT
167** DOUBLESTAR
168+= PLUSEQUAL
169-= MINEQUAL
170*= STAREQUAL
171/= SLASHEQUAL
172%= PERCENTEQUAL
173&= AMPEREQUAL
174|= VBAREQUAL
175^= CIRCUMFLEXEQUAL
176<<= LEFTSHIFTEQUAL
177>>= RIGHTSHIFTEQUAL
178**= DOUBLESTAREQUAL
179// DOUBLESLASH
180//= DOUBLESLASHEQUAL
181-> RARROW
182"""
183
184opmap = {}
185for line in opmap_raw.splitlines():
186 if line:
187 op, name = line.split()
188 opmap[op] = getattr(token, name)