# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import pickle

# Local imports
from . import token


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py. The parsing engine
    accesses the instance variables directly. The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses. The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass). The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers. Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is an (i, j) pair where i is a label and j is
                     a state number. The DFA number is the index into
                     this list. (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords. The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """
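
    # A hedged, purely illustrative sketch (added by the editor, not taken
    # from any real grammar file): for a hypothetical one-rule grammar
    # `pass_stmt: 'pass'`, the tables might look roughly as follows. The rule
    # gets symbol number 256, the keyword 'pass' gets label index 1, and the
    # single DFA has one arc on that label followed by a final state:
    #
    #   symbol2number = {'pass_stmt': 256}
    #   number2symbol = {256: 'pass_stmt'}
    #   labels        = [(0, 'EMPTY'), (1, 'pass')]   # (token.NAME, keyword)
    #   keywords      = {'pass': 1}
    #   states        = [[[(1, 1)], [(0, 1)]]]        # arc on label 1, then final
    #   dfas          = {256: (states[0], {1: 1})}
    #   start         = 256
    #
    # Tables generated by pgen for the full grammar are much larger but
    # follow the same shape.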

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file."""
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """
        Copy the grammar.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)
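

# A minimal, hedged usage sketch added for illustration; it is not part of
# the original module. It assumes the module lives in its usual lib2to3.pgen2
# package (so the relative `token` import above resolves) and is run as
# `python -m lib2to3.pgen2.grammar`.
if __name__ == "__main__":
    import os
    import tempfile

    # opmap resolves operator strings to the concrete token codes the parser
    # needs, instead of the generic OP code reported by tokenize.
    assert opmap["("] == token.LPAR
    assert opmap["->"] == token.RARROW

    # Round-trip an (empty) Grammar through dump() and load().
    g = Grammar()
    fd, path = tempfile.mkstemp(suffix=".pickle")
    os.close(fd)
    try:
        g.dump(path)
        g2 = Grammar()
        g2.load(path)
        assert g2.start == g.start == 256
        assert g2.labels == [(0, "EMPTY")]
    finally:
        os.remove(path)
    print("grammar round-trip OK")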