blob: 5416377100c64a48a5fd63fc31017d93507463e6 [file] [log] [blame]
Eli Bendersky5e3d3382014-05-09 17:58:22 -07001#-------------------------------------------------------------------------------
2# Parser for ASDL [1] definition files. Reads in an ASDL description and parses
3# it into an AST that describes it.
4#
5# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
6# modules and attributes after a product. Words starting with Capital letters
7# are terminals. Literal tokens are in "double quotes". Others are
8# non-terminals. Id is either TokenId or ConstructorId.
9#
10# module ::= "module" Id "{" [definitions] "}"
11# definitions ::= { TypeId "=" type }
12# type ::= product | sum
13# product ::= fields ["attributes" fields]
14# fields ::= "(" { field, "," } field ")"
15# field ::= TypeId ["?" | "*"] [Id]
16# sum ::= constructor { "|" constructor } ["attributes" fields]
17# constructor ::= ConstructorId [fields]
18#
19# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See
20# http://asdl.sourceforge.net/
21#-------------------------------------------------------------------------------
22from collections import namedtuple
23import re
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000024
Eli Bendersky5e3d3382014-05-09 17:58:22 -070025__all__ = [
26 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
27 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check']
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000028
Eli Bendersky5e3d3382014-05-09 17:58:22 -070029# The following classes define nodes into which the ASDL description is parsed.
30# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
31# structure used by a programming language. But ASDL files themselves need to be
32# parsed. This module parses ASDL files and uses a simple AST to represent them.
33# See the EBNF at the top of the file to understand the logical connection
34# between the various node types.
Martin v. Löwiseae93b72006-02-28 00:12:47 +000035
Victor Stinnerf2c1aa12016-01-26 00:40:57 +010036builtin_types = {'identifier', 'string', 'bytes', 'int', 'object', 'singleton',
37 'constant'}
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000038
Eli Bendersky5e3d3382014-05-09 17:58:22 -070039class AST:
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000040 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -070041 raise NotImplementedError
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000042
43class Module(AST):
Benjamin Peterson6cb2b922011-03-12 18:28:16 -060044 def __init__(self, name, dfns):
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000045 self.name = name
46 self.dfns = dfns
Eli Bendersky5e3d3382014-05-09 17:58:22 -070047 self.types = {type.name: type.value for type in dfns}
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000048
49 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -070050 return 'Module({0.name}, {0.dfns})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000051
52class Type(AST):
53 def __init__(self, name, value):
54 self.name = name
55 self.value = value
56
57 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -070058 return 'Type({0.name}, {0.value})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000059
60class Constructor(AST):
61 def __init__(self, name, fields=None):
62 self.name = name
63 self.fields = fields or []
64
65 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -070066 return 'Constructor({0.name}, {0.fields})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000067
68class Field(AST):
Benjamin Peterson87c8d872009-06-11 22:54:11 +000069 def __init__(self, type, name=None, seq=False, opt=False):
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000070 self.type = type
71 self.name = name
72 self.seq = seq
73 self.opt = opt
74
Batuhan Taşkaya4ab362c2020-03-16 11:12:53 +030075 def __str__(self):
76 if self.seq:
77 extra = "*"
78 elif self.opt:
79 extra = "?"
80 else:
81 extra = ""
82
83 return "{}{} {}".format(self.type, extra, self.name)
84
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000085 def __repr__(self):
86 if self.seq:
Benjamin Peterson87c8d872009-06-11 22:54:11 +000087 extra = ", seq=True"
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000088 elif self.opt:
Benjamin Peterson87c8d872009-06-11 22:54:11 +000089 extra = ", opt=True"
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000090 else:
91 extra = ""
92 if self.name is None:
Eli Bendersky5e3d3382014-05-09 17:58:22 -070093 return 'Field({0.type}{1})'.format(self, extra)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000094 else:
Eli Bendersky5e3d3382014-05-09 17:58:22 -070095 return 'Field({0.type}, {0.name}{1})'.format(self, extra)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +000096
97class Sum(AST):
98 def __init__(self, types, attributes=None):
99 self.types = types
100 self.attributes = attributes or []
101
102 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700103 if self.attributes:
104 return 'Sum({0.types}, {0.attributes})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000105 else:
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700106 return 'Sum({0.types})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000107
108class Product(AST):
Benjamin Petersoncda75be2013-03-18 10:48:58 -0700109 def __init__(self, fields, attributes=None):
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000110 self.fields = fields
Benjamin Petersoncda75be2013-03-18 10:48:58 -0700111 self.attributes = attributes or []
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000112
113 def __repr__(self):
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700114 if self.attributes:
115 return 'Product({0.fields}, {0.attributes})'.format(self)
Benjamin Petersoncda75be2013-03-18 10:48:58 -0700116 else:
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700117 return 'Product({0.fields})'.format(self)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000118
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700119# A generic visitor for the meta-AST that describes ASDL. This can be used by
120# emitters. Note that this visitor does not provide a generic visit method, so a
121# subclass needs to define visit methods from visitModule to as deep as the
122# interesting node.
123# We also define a Check visitor that makes sure the parsed ASDL is well-formed.
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000124
Guido van Rossum416b5162014-07-08 16:22:48 -0700125class VisitorBase(object):
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700126 """Generic tree visitor for ASTs."""
127 def __init__(self):
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000128 self.cache = {}
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000129
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700130 def visit(self, obj, *args):
131 klass = obj.__class__
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000132 meth = self.cache.get(klass)
133 if meth is None:
134 methname = "visit" + klass.__name__
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700135 meth = getattr(self, methname, None)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000136 self.cache[klass] = meth
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700137 if meth:
138 try:
139 meth(obj, *args)
140 except Exception as e:
141 print("Error visiting %r: %s" % (obj, e))
142 raise
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000143
144class Check(VisitorBase):
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700145 """A visitor that checks a parsed ASDL tree for correctness.
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000146
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700147 Errors are printed and accumulated.
148 """
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000149 def __init__(self):
Guido van Rossum416b5162014-07-08 16:22:48 -0700150 super(Check, self).__init__()
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000151 self.cons = {}
152 self.errors = 0
153 self.types = {}
154
155 def visitModule(self, mod):
156 for dfn in mod.dfns:
157 self.visit(dfn)
158
159 def visitType(self, type):
160 self.visit(type.value, str(type.name))
161
162 def visitSum(self, sum, name):
163 for t in sum.types:
164 self.visit(t, name)
165
166 def visitConstructor(self, cons, name):
167 key = str(cons.name)
168 conflict = self.cons.get(key)
169 if conflict is None:
170 self.cons[key] = name
171 else:
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700172 print('Redefinition of constructor {}'.format(key))
173 print('Defined in {} and {}'.format(conflict, name))
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000174 self.errors += 1
175 for f in cons.fields:
176 self.visit(f, key)
177
178 def visitField(self, field, name):
179 key = str(field.type)
180 l = self.types.setdefault(key, [])
181 l.append(name)
182
183 def visitProduct(self, prod, name):
184 for f in prod.fields:
185 self.visit(f, name)
186
187def check(mod):
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700188 """Check the parsed ASDL tree for correctness.
189
190 Return True if success. For failure, the errors are printed out and False
191 is returned.
192 """
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000193 v = Check()
194 v.visit(mod)
195
196 for t in v.types:
Fred Drake34e4f522006-12-29 04:42:48 +0000197 if t not in mod.types and not t in builtin_types:
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000198 v.errors += 1
199 uses = ", ".join(v.types[t])
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700200 print('Undefined type {}, used in {}'.format(t, uses))
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000201 return not v.errors
202
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700203# The ASDL parser itself comes next. The only interesting external interface
204# here is the top-level parse function.
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000205
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700206def parse(filename):
207 """Parse ASDL from the given file and return a Module node describing it."""
208 with open(filename) as f:
209 parser = ASDLParser()
210 return parser.parse(f.read())
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000211
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700212# Types for describing tokens in an ASDL specification.
213class TokenKind:
214 """TokenKind is provides a scope for enumerated token kinds."""
215 (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
216 LParen, RParen, LBrace, RBrace) = range(11)
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000217
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700218 operator_table = {
219 '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
220 ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
Tim Peters536cf992005-12-25 23:18:31 +0000221
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700222Token = namedtuple('Token', 'kind value lineno')
223
224class ASDLSyntaxError(Exception):
225 def __init__(self, msg, lineno=None):
226 self.msg = msg
227 self.lineno = lineno or '<unknown>'
228
229 def __str__(self):
230 return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
231
232def tokenize_asdl(buf):
233 """Tokenize the given buffer. Yield Token objects."""
234 for lineno, line in enumerate(buf.splitlines(), 1):
235 for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
236 c = m.group(1)
237 if c[0].isalpha():
238 # Some kind of identifier
239 if c[0].isupper():
240 yield Token(TokenKind.ConstructorId, c, lineno)
241 else:
242 yield Token(TokenKind.TypeId, c, lineno)
243 elif c[:2] == '--':
244 # Comment
245 break
246 else:
247 # Operators
248 try:
249 op_kind = TokenKind.operator_table[c]
250 except KeyError:
251 raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
252 yield Token(op_kind, c, lineno)
253
254class ASDLParser:
255 """Parser for ASDL files.
256
257 Create, then call the parse method on a buffer containing ASDL.
258 This is a simple recursive descent parser that uses tokenize_asdl for the
259 lexing.
260 """
261 def __init__(self):
262 self._tokenizer = None
263 self.cur_token = None
264
265 def parse(self, buf):
266 """Parse the ASDL in the buffer and return an AST with a Module root.
267 """
268 self._tokenizer = tokenize_asdl(buf)
269 self._advance()
270 return self._parse_module()
271
272 def _parse_module(self):
273 if self._at_keyword('module'):
274 self._advance()
Jeremy Hylton3e0055f2005-10-20 19:59:25 +0000275 else:
Eli Bendersky5e3d3382014-05-09 17:58:22 -0700276 raise ASDLSyntaxError(
277 'Expected "module" (found {})'.format(self.cur_token.value),
278 self.cur_token.lineno)
279 name = self._match(self._id_kinds)
280 self._match(TokenKind.LBrace)
281 defs = self._parse_definitions()
282 self._match(TokenKind.RBrace)
283 return Module(name, defs)
284
285 def _parse_definitions(self):
286 defs = []
287 while self.cur_token.kind == TokenKind.TypeId:
288 typename = self._advance()
289 self._match(TokenKind.Equals)
290 type = self._parse_type()
291 defs.append(Type(typename, type))
292 return defs
293
294 def _parse_type(self):
295 if self.cur_token.kind == TokenKind.LParen:
296 # If we see a (, it's a product
297 return self._parse_product()
298 else:
299 # Otherwise it's a sum. Look for ConstructorId
300 sumlist = [Constructor(self._match(TokenKind.ConstructorId),
301 self._parse_optional_fields())]
302 while self.cur_token.kind == TokenKind.Pipe:
303 # More constructors
304 self._advance()
305 sumlist.append(Constructor(
306 self._match(TokenKind.ConstructorId),
307 self._parse_optional_fields()))
308 return Sum(sumlist, self._parse_optional_attributes())
309
310 def _parse_product(self):
311 return Product(self._parse_fields(), self._parse_optional_attributes())
312
313 def _parse_fields(self):
314 fields = []
315 self._match(TokenKind.LParen)
316 while self.cur_token.kind == TokenKind.TypeId:
317 typename = self._advance()
318 is_seq, is_opt = self._parse_optional_field_quantifier()
319 id = (self._advance() if self.cur_token.kind in self._id_kinds
320 else None)
321 fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
322 if self.cur_token.kind == TokenKind.RParen:
323 break
324 elif self.cur_token.kind == TokenKind.Comma:
325 self._advance()
326 self._match(TokenKind.RParen)
327 return fields
328
329 def _parse_optional_fields(self):
330 if self.cur_token.kind == TokenKind.LParen:
331 return self._parse_fields()
332 else:
333 return None
334
335 def _parse_optional_attributes(self):
336 if self._at_keyword('attributes'):
337 self._advance()
338 return self._parse_fields()
339 else:
340 return None
341
342 def _parse_optional_field_quantifier(self):
343 is_seq, is_opt = False, False
344 if self.cur_token.kind == TokenKind.Asterisk:
345 is_seq = True
346 self._advance()
347 elif self.cur_token.kind == TokenKind.Question:
348 is_opt = True
349 self._advance()
350 return is_seq, is_opt
351
352 def _advance(self):
353 """ Return the value of the current token and read the next one into
354 self.cur_token.
355 """
356 cur_val = None if self.cur_token is None else self.cur_token.value
357 try:
358 self.cur_token = next(self._tokenizer)
359 except StopIteration:
360 self.cur_token = None
361 return cur_val
362
363 _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
364
365 def _match(self, kind):
366 """The 'match' primitive of RD parsers.
367
368 * Verifies that the current token is of the given kind (kind can
369 be a tuple, in which the kind must match one of its members).
370 * Returns the value of the current token
371 * Reads in the next token
372 """
373 if (isinstance(kind, tuple) and self.cur_token.kind in kind or
374 self.cur_token.kind == kind
375 ):
376 value = self.cur_token.value
377 self._advance()
378 return value
379 else:
380 raise ASDLSyntaxError(
381 'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
382 self.cur_token.lineno)
383
384 def _at_keyword(self, keyword):
385 return (self.cur_token.kind == TokenKind.TypeId and
386 self.cur_token.value == keyword)