| #------------------------------------------------------------------------------- |
| # Parser for ASDL [1] definition files. Reads in an ASDL description and parses |
| # it into an AST that describes it. |
| # |
| # The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support |
| # modules and attributes after a product. Words starting with Capital letters |
| # are terminals. Literal tokens are in "double quotes". Others are |
| # non-terminals. Id is either TokenId or ConstructorId. |
| # |
| # module ::= "module" Id "{" [definitions] "}" |
| # definitions ::= { TypeId "=" type } |
| # type ::= product | sum |
| # product ::= fields ["attributes" fields] |
| # fields ::= "(" { field, "," } field ")" |
| # field ::= TypeId ["?" | "*"] [Id] |
| # sum ::= constructor { "|" constructor } ["attributes" fields] |
| # constructor ::= ConstructorId [fields] |
| # |
| # [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See |
| # http://asdl.sourceforge.net/ |
| #------------------------------------------------------------------------------- |
| from collections import namedtuple |
| import re |
| |
| __all__ = [ |
| 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor', |
| 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check'] |
| |
| # The following classes define nodes into which the ASDL description is parsed. |
| # Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST |
| # structure used by a programming language. But ASDL files themselves need to be |
| # parsed. This module parses ASDL files and uses a simple AST to represent them. |
| # See the EBNF at the top of the file to understand the logical connection |
| # between the various node types. |
| |
| builtin_types = {'identifier', 'string', 'int', 'constant'} |
| |
| class AST: |
| def __repr__(self): |
| raise NotImplementedError |
| |
| class Module(AST): |
| def __init__(self, name, dfns): |
| self.name = name |
| self.dfns = dfns |
| self.types = {type.name: type.value for type in dfns} |
| |
| def __repr__(self): |
| return 'Module({0.name}, {0.dfns})'.format(self) |
| |
| class Type(AST): |
| def __init__(self, name, value): |
| self.name = name |
| self.value = value |
| |
| def __repr__(self): |
| return 'Type({0.name}, {0.value})'.format(self) |
| |
| class Constructor(AST): |
| def __init__(self, name, fields=None): |
| self.name = name |
| self.fields = fields or [] |
| |
| def __repr__(self): |
| return 'Constructor({0.name}, {0.fields})'.format(self) |
| |
| class Field(AST): |
| def __init__(self, type, name=None, seq=False, opt=False): |
| self.type = type |
| self.name = name |
| self.seq = seq |
| self.opt = opt |
| |
| def __str__(self): |
| if self.seq: |
| extra = "*" |
| elif self.opt: |
| extra = "?" |
| else: |
| extra = "" |
| |
| return "{}{} {}".format(self.type, extra, self.name) |
| |
| def __repr__(self): |
| if self.seq: |
| extra = ", seq=True" |
| elif self.opt: |
| extra = ", opt=True" |
| else: |
| extra = "" |
| if self.name is None: |
| return 'Field({0.type}{1})'.format(self, extra) |
| else: |
| return 'Field({0.type}, {0.name}{1})'.format(self, extra) |
| |
| class Sum(AST): |
| def __init__(self, types, attributes=None): |
| self.types = types |
| self.attributes = attributes or [] |
| |
| def __repr__(self): |
| if self.attributes: |
| return 'Sum({0.types}, {0.attributes})'.format(self) |
| else: |
| return 'Sum({0.types})'.format(self) |
| |
| class Product(AST): |
| def __init__(self, fields, attributes=None): |
| self.fields = fields |
| self.attributes = attributes or [] |
| |
| def __repr__(self): |
| if self.attributes: |
| return 'Product({0.fields}, {0.attributes})'.format(self) |
| else: |
| return 'Product({0.fields})'.format(self) |
| |
| # A generic visitor for the meta-AST that describes ASDL. This can be used by |
| # emitters. Note that this visitor does not provide a generic visit method, so a |
| # subclass needs to define visit methods from visitModule to as deep as the |
| # interesting node. |
| # We also define a Check visitor that makes sure the parsed ASDL is well-formed. |
| |
| class VisitorBase(object): |
| """Generic tree visitor for ASTs.""" |
| def __init__(self): |
| self.cache = {} |
| |
| def visit(self, obj, *args): |
| klass = obj.__class__ |
| meth = self.cache.get(klass) |
| if meth is None: |
| methname = "visit" + klass.__name__ |
| meth = getattr(self, methname, None) |
| self.cache[klass] = meth |
| if meth: |
| try: |
| meth(obj, *args) |
| except Exception as e: |
| print("Error visiting %r: %s" % (obj, e)) |
| raise |
| |
| class Check(VisitorBase): |
| """A visitor that checks a parsed ASDL tree for correctness. |
| |
| Errors are printed and accumulated. |
| """ |
| def __init__(self): |
| super(Check, self).__init__() |
| self.cons = {} |
| self.errors = 0 |
| self.types = {} |
| |
| def visitModule(self, mod): |
| for dfn in mod.dfns: |
| self.visit(dfn) |
| |
| def visitType(self, type): |
| self.visit(type.value, str(type.name)) |
| |
| def visitSum(self, sum, name): |
| for t in sum.types: |
| self.visit(t, name) |
| |
| def visitConstructor(self, cons, name): |
| key = str(cons.name) |
| conflict = self.cons.get(key) |
| if conflict is None: |
| self.cons[key] = name |
| else: |
| print('Redefinition of constructor {}'.format(key)) |
| print('Defined in {} and {}'.format(conflict, name)) |
| self.errors += 1 |
| for f in cons.fields: |
| self.visit(f, key) |
| |
| def visitField(self, field, name): |
| key = str(field.type) |
| l = self.types.setdefault(key, []) |
| l.append(name) |
| |
| def visitProduct(self, prod, name): |
| for f in prod.fields: |
| self.visit(f, name) |
| |
| def check(mod): |
| """Check the parsed ASDL tree for correctness. |
| |
| Return True if success. For failure, the errors are printed out and False |
| is returned. |
| """ |
| v = Check() |
| v.visit(mod) |
| |
| for t in v.types: |
| if t not in mod.types and not t in builtin_types: |
| v.errors += 1 |
| uses = ", ".join(v.types[t]) |
| print('Undefined type {}, used in {}'.format(t, uses)) |
| return not v.errors |
| |
| # The ASDL parser itself comes next. The only interesting external interface |
| # here is the top-level parse function. |
| |
| def parse(filename): |
| """Parse ASDL from the given file and return a Module node describing it.""" |
| with open(filename) as f: |
| parser = ASDLParser() |
| return parser.parse(f.read()) |
| |
| # Types for describing tokens in an ASDL specification. |
| class TokenKind: |
| """TokenKind is provides a scope for enumerated token kinds.""" |
| (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk, |
| LParen, RParen, LBrace, RBrace) = range(11) |
| |
| operator_table = { |
| '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen, |
| ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace} |
| |
| Token = namedtuple('Token', 'kind value lineno') |
| |
| class ASDLSyntaxError(Exception): |
| def __init__(self, msg, lineno=None): |
| self.msg = msg |
| self.lineno = lineno or '<unknown>' |
| |
| def __str__(self): |
| return 'Syntax error on line {0.lineno}: {0.msg}'.format(self) |
| |
| def tokenize_asdl(buf): |
| """Tokenize the given buffer. Yield Token objects.""" |
| for lineno, line in enumerate(buf.splitlines(), 1): |
| for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()): |
| c = m.group(1) |
| if c[0].isalpha(): |
| # Some kind of identifier |
| if c[0].isupper(): |
| yield Token(TokenKind.ConstructorId, c, lineno) |
| else: |
| yield Token(TokenKind.TypeId, c, lineno) |
| elif c[:2] == '--': |
| # Comment |
| break |
| else: |
| # Operators |
| try: |
| op_kind = TokenKind.operator_table[c] |
| except KeyError: |
| raise ASDLSyntaxError('Invalid operator %s' % c, lineno) |
| yield Token(op_kind, c, lineno) |
| |
| class ASDLParser: |
| """Parser for ASDL files. |
| |
| Create, then call the parse method on a buffer containing ASDL. |
| This is a simple recursive descent parser that uses tokenize_asdl for the |
| lexing. |
| """ |
| def __init__(self): |
| self._tokenizer = None |
| self.cur_token = None |
| |
| def parse(self, buf): |
| """Parse the ASDL in the buffer and return an AST with a Module root. |
| """ |
| self._tokenizer = tokenize_asdl(buf) |
| self._advance() |
| return self._parse_module() |
| |
| def _parse_module(self): |
| if self._at_keyword('module'): |
| self._advance() |
| else: |
| raise ASDLSyntaxError( |
| 'Expected "module" (found {})'.format(self.cur_token.value), |
| self.cur_token.lineno) |
| name = self._match(self._id_kinds) |
| self._match(TokenKind.LBrace) |
| defs = self._parse_definitions() |
| self._match(TokenKind.RBrace) |
| return Module(name, defs) |
| |
| def _parse_definitions(self): |
| defs = [] |
| while self.cur_token.kind == TokenKind.TypeId: |
| typename = self._advance() |
| self._match(TokenKind.Equals) |
| type = self._parse_type() |
| defs.append(Type(typename, type)) |
| return defs |
| |
| def _parse_type(self): |
| if self.cur_token.kind == TokenKind.LParen: |
| # If we see a (, it's a product |
| return self._parse_product() |
| else: |
| # Otherwise it's a sum. Look for ConstructorId |
| sumlist = [Constructor(self._match(TokenKind.ConstructorId), |
| self._parse_optional_fields())] |
| while self.cur_token.kind == TokenKind.Pipe: |
| # More constructors |
| self._advance() |
| sumlist.append(Constructor( |
| self._match(TokenKind.ConstructorId), |
| self._parse_optional_fields())) |
| return Sum(sumlist, self._parse_optional_attributes()) |
| |
| def _parse_product(self): |
| return Product(self._parse_fields(), self._parse_optional_attributes()) |
| |
| def _parse_fields(self): |
| fields = [] |
| self._match(TokenKind.LParen) |
| while self.cur_token.kind == TokenKind.TypeId: |
| typename = self._advance() |
| is_seq, is_opt = self._parse_optional_field_quantifier() |
| id = (self._advance() if self.cur_token.kind in self._id_kinds |
| else None) |
| fields.append(Field(typename, id, seq=is_seq, opt=is_opt)) |
| if self.cur_token.kind == TokenKind.RParen: |
| break |
| elif self.cur_token.kind == TokenKind.Comma: |
| self._advance() |
| self._match(TokenKind.RParen) |
| return fields |
| |
| def _parse_optional_fields(self): |
| if self.cur_token.kind == TokenKind.LParen: |
| return self._parse_fields() |
| else: |
| return None |
| |
| def _parse_optional_attributes(self): |
| if self._at_keyword('attributes'): |
| self._advance() |
| return self._parse_fields() |
| else: |
| return None |
| |
| def _parse_optional_field_quantifier(self): |
| is_seq, is_opt = False, False |
| if self.cur_token.kind == TokenKind.Asterisk: |
| is_seq = True |
| self._advance() |
| elif self.cur_token.kind == TokenKind.Question: |
| is_opt = True |
| self._advance() |
| return is_seq, is_opt |
| |
| def _advance(self): |
| """ Return the value of the current token and read the next one into |
| self.cur_token. |
| """ |
| cur_val = None if self.cur_token is None else self.cur_token.value |
| try: |
| self.cur_token = next(self._tokenizer) |
| except StopIteration: |
| self.cur_token = None |
| return cur_val |
| |
| _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId) |
| |
| def _match(self, kind): |
| """The 'match' primitive of RD parsers. |
| |
| * Verifies that the current token is of the given kind (kind can |
| be a tuple, in which the kind must match one of its members). |
| * Returns the value of the current token |
| * Reads in the next token |
| """ |
| if (isinstance(kind, tuple) and self.cur_token.kind in kind or |
| self.cur_token.kind == kind |
| ): |
| value = self.cur_token.value |
| self._advance() |
| return value |
| else: |
| raise ASDLSyntaxError( |
| 'Unmatched {} (found {})'.format(kind, self.cur_token.kind), |
| self.cur_token.lineno) |
| |
| def _at_keyword(self, keyword): |
| return (self.cur_token.kind == TokenKind.TypeId and |
| self.cur_token.value == keyword) |