"""Parser for the Python metagrammar"""

import io
import tokenize  # from stdlib

from .automata import NFA, NFAState


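# Typical usage (a minimal sketch; the grammar text below is only an
# illustrative one-rule example, not a real Python grammar):
#
#     parser = GrammarParser("start: NAME NEWLINE\n")
#     nfas = list(parser.parse())  # one NFA per rule in the grammar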
class GrammarParser:
    """Parser for Python grammar files."""

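    # Human-readable names for token types, used when formatting error messages.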
    _translation_table = {
        tokenize.NAME: "NAME",
        tokenize.STRING: "STRING",
        tokenize.NEWLINE: "NEWLINE",
        tokenize.NL: "NL",
        tokenize.OP: "OP",
        tokenize.ENDMARKER: "ENDMARKER",
        tokenize.COMMENT: "COMMENT",
    }

    def __init__(self, grammar):
        self.grammar = grammar
        grammar_adaptor = io.StringIO(grammar)
        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
        self._gettoken()  # Initialize lookahead
        self._current_rule_name = None

    def parse(self):
        """Turn the grammar into a collection of NFAs"""
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)

            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
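            # Build the alternation around a fresh pair of states (aa, zz):
            # each alternative gets an epsilon arc in from ``aa`` and an
            # epsilon arc out to ``zz``, so matching any single alternative
            # takes the automaton from ``aa`` to ``zz``.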
            while True:
                # Epsilon arcs: from the common start state into this
                # alternative, and from this alternative's end to the common
                # end state.
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break

                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Add an epsilon arc from the end of the previous item to the
            # beginning of the new one, chaining all the items together. This
            # way the end can only be reached after visiting every item.
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
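        # A '[' rhs ']' group is made optional with an extra epsilon arc from
        # its start to its end; '+' adds an arc from the end back to the start
        # so the atom can repeat; '*' additionally uses the start state as the
        # end state, so zero repetitions are accepted as well.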
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Make a transition from the beginning to the end so it is possible
            # to advance to the next state of this item without consuming
            # anything from the rhs.
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
            z.add_arc(a)
            if value == "+":
                # The arc from the end back to the beginning lets the atom
                # repeat; at least one match is still needed to reach the end.
                return a, z
            else:
                # For '*' the end state is the beginning itself, so the atom
                # can be repeated any number of times or skipped entirely.
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # We can only transition to the next state by consuming the value.
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

    def _expect(self, type_, value=None):
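        # Check that the current token has the expected type (and value, if
        # given), then consume it and return its string value.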
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

    def _gettoken(self):
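        # Advance the lookahead to the next significant token, skipping
        # comments and non-logical newlines (NL).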
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
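        # Format the message (falling back to plain concatenation if
        # str.format fails) and raise a SyntaxError pointing at the current
        # token's position in the grammar source.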
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))