"""Parser for the Python metagrammar"""

import io
import tokenize  # from stdlib

from .automata import NFA, NFAState


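# Each grammar rule is compiled into a small NFA (a Thompson-style
# construction): every rule gets a start and an end state, and the helpers
# below wire alternation, concatenation and repetition between them using
# labeled arcs and epsilon (unlabeled) arcs.
#
# A minimal usage sketch (the one-rule grammar string is made up for
# illustration):
#
#     parser = GrammarParser("start: NAME '+' NAME NEWLINE\n")
#     for nfa in parser.parse():
#         ...  # one NFA per rule, e.g. to feed the NFA-to-DFA conversion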
class GrammarParser:
    """Parser for Python grammar files."""

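    # Human-readable names for token type codes, used to build error messages.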
    _translation_table = {
        tokenize.NAME: "NAME",
        tokenize.STRING: "STRING",
        tokenize.NEWLINE: "NEWLINE",
        tokenize.NL: "NL",
        tokenize.OP: "OP",
        tokenize.ENDMARKER: "ENDMARKER",
        tokenize.COMMENT: "COMMENT",
    }

    def __init__(self, grammar):
        self.grammar = grammar
        grammar_adaptor = io.StringIO(grammar)
        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
        self._gettoken()  # Initialize lookahead
        self._current_rule_name = None

    def parse(self):
        """Turn the grammar into a collection of NFAs"""
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)

            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
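            # For a rule like `rhs: A | B`, the loop below wires the
            # alternatives in parallel between a shared start state and a
            # shared end state (unlabeled arcs are epsilon transitions):
            #
            #          +--> a(A) ...... z(A) --+
            #     aa --|                       |--> zz
            #          +--> a(B) ...... z(B) --+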
            while True:
                # Add an epsilon arc from the shared start state to the start
                # of this alternative, and from this alternative's end to the
                # shared end state, so each alternative can be taken
                # independently of the others.
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break

                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
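        # Each subsequent item is chained after the previous one with an
        # epsilon arc, e.g. for `items: A B`:
        #
        #     a(A) ...... b(A) ----> c(B) ...... d(B)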
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Add an epsilon arc from the end of the previous item to the
            # beginning of the new one, connecting all the items together.
            # This way the end can only be reached after visiting every item.
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Add an epsilon arc from the beginning to the end, making the
            # whole rhs optional: we can advance to the next state without
            # consuming anything from it.
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
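            # Wire the end of the atom back to its start so it can repeat
            # (an epsilon arc from z to a):
            #
            #       +------<------+
            #       v             |
            #     a(atom) ..... z(atom)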
            z.add_arc(a)
            if value == "+":
                # For '+' keep z as the end state: the cycle lets the atom
                # repeat, but it must be consumed at least once.
                return a, z
            else:
                # For '*' the end state is the beginning itself, so the atom
                # can be repeated any number of times, including zero.
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # The arc is labeled with the token's value, so the NFA can only
            # advance to the next state by consuming that value.
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

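    # Consume the lookahead token if it matches the expected type (and value,
    # when given) and return its value; otherwise raise a SyntaxError.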
    def _expect(self, type_, value=None):
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

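    # Advance the one-token lookahead (self.type / self.value / position),
    # silently skipping comments and non-logical newlines.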
    def _gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))