blob: e0b3530f511b5b610d46ec5d194c175c2644d9c0 [file] [log] [blame]
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001/* Parser implementation */
2
3/* For a description, see the comments at end of this file */
4
5/* XXX To do: error recovery */
6
7#include <stdio.h>
8#include "assert.h"
9
10#include "PROTO.h"
11#include "malloc.h"
12#include "token.h"
13#include "grammar.h"
14#include "node.h"
15#include "parser.h"
16#include "errcode.h"
17
18extern int debugging;
19
20#ifdef DEBUG
21#define D(x) if (!debugging); else x
22#else
23#define D(x)
24#endif
25
26
27/* STACK DATA TYPE */
28
29static void s_reset PROTO((stack *));
30
31static void
32s_reset(s)
33 stack *s;
34{
35 s->s_top = &s->s_base[MAXSTACK];
36}
37
38#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
39
40static int s_push PROTO((stack *, dfa *, node *));
41
42static int
43s_push(s, d, parent)
44 register stack *s;
45 dfa *d;
46 node *parent;
47{
48 register stackentry *top;
49 if (s->s_top == s->s_base) {
50 fprintf(stderr, "s_push: parser stack overflow\n");
51 return -1;
52 }
53 top = --s->s_top;
54 top->s_dfa = d;
55 top->s_parent = parent;
56 top->s_state = 0;
57 return 0;
58}
59
60#ifdef DEBUG
61
62static void s_pop PROTO((stack *));
63
64static void
65s_pop(s)
66 register stack *s;
67{
68 if (s_empty(s)) {
69 fprintf(stderr, "s_pop: parser stack underflow -- FATAL\n");
70 abort();
71 }
72 s->s_top++;
73}
74
75#else /* !DEBUG */
76
77#define s_pop(s) (s)->s_top++
78
79#endif
80
81
82/* PARSER CREATION */
83
84parser_state *
85newparser(g, start)
86 grammar *g;
87 int start;
88{
89 parser_state *ps;
90
91 if (!g->g_accel)
92 addaccelerators(g);
93 ps = NEW(parser_state, 1);
94 if (ps == NULL)
95 return NULL;
96 ps->p_grammar = g;
97 ps->p_tree = newnode(start);
98 if (ps->p_tree == NULL) {
99 if (ps->p_tree != NULL)
100 DEL(ps->p_tree); /* XXX freeing a node!?! */
101 DEL(ps);
102 return NULL;
103 }
104 s_reset(&ps->p_stack);
105 (void) s_push(&ps->p_stack, finddfa(g, start), ps->p_tree);
106 return ps;
107}
108
109void
110delparser(ps)
111 parser_state *ps;
112{
113 DEL(ps);
114}
115
116
117/* PARSER STACK OPERATIONS */
118
119static int shift PROTO((stack *, int, char *, int));
120
121static int
122shift(s, type, str, newstate)
123 register stack *s;
124 int type;
125 char *str;
126 int newstate;
127{
128 assert(!s_empty(s));
129 if (addchild(s->s_top->s_parent, type, str) == NULL) {
130 fprintf(stderr, "shift: no mem in addchild\n");
131 return -1;
132 }
133 s->s_top->s_state = newstate;
134 return 0;
135}
136
137static int push PROTO((stack *, int, dfa *, int));
138
139static int
140push(s, type, d, newstate)
141 register stack *s;
142 int type;
143 dfa *d;
144 int newstate;
145{
146 register node *n;
147 n = s->s_top->s_parent;
148 assert(!s_empty(s));
149 if (addchild(n, type, (char *)NULL) == NULL) {
150 fprintf(stderr, "push: no mem in addchild\n");
151 return -1;
152 }
153 s->s_top->s_state = newstate;
154 return s_push(s, d, CHILD(n, NCH(n)-1));
155}
156
157
158/* PARSER PROPER */
159
160static int classify PROTO((grammar *, int, char *));
161
162static int
163classify(g, type, str)
164 grammar *g;
165 register int type;
166 char *str;
167{
168 register int n = g->g_ll.ll_nlabels;
169
170 if (type == NAME) {
171 register char *s = str;
172 register label *l = g->g_ll.ll_label;
173 register int i;
174 for (i = n; i > 0; i--, l++) {
175 if (l->lb_type == NAME && l->lb_str != NULL &&
176 l->lb_str[0] == s[0] &&
177 strcmp(l->lb_str, s) == 0) {
178 D(printf("It's a keyword\n"));
179 return n - i;
180 }
181 }
182 }
183
184 {
185 register label *l = g->g_ll.ll_label;
186 register int i;
187 for (i = n; i > 0; i--, l++) {
188 if (l->lb_type == type && l->lb_str == NULL) {
189 D(printf("It's a token we know\n"));
190 return n - i;
191 }
192 }
193 }
194
195 D(printf("Illegal token\n"));
196 return -1;
197}
198
199int
200addtoken(ps, type, str)
201 register parser_state *ps;
202 register int type;
203 char *str;
204{
205 register int ilabel;
206
207 D(printf("Token %s/'%s' ... ", tok_name[type], str));
208
209 /* Find out which label this token is */
210 ilabel = classify(ps->p_grammar, type, str);
211 if (ilabel < 0)
212 return E_SYNTAX;
213
214 /* Loop until the token is shifted or an error occurred */
215 for (;;) {
216 /* Fetch the current dfa and state */
217 register dfa *d = ps->p_stack.s_top->s_dfa;
218 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
219
220 D(printf(" DFA '%s', state %d:",
221 d->d_name, ps->p_stack.s_top->s_state));
222
223 /* Check accelerator */
224 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
225 register int x = s->s_accel[ilabel - s->s_lower];
226 if (x != -1) {
227 if (x & (1<<7)) {
228 /* Push non-terminal */
229 int nt = (x >> 8) + NT_OFFSET;
230 int arrow = x & ((1<<7)-1);
231 dfa *d1 = finddfa(ps->p_grammar, nt);
232 if (push(&ps->p_stack, nt, d1, arrow) < 0) {
233 D(printf(" MemError: push.\n"));
234 return E_NOMEM;
235 }
236 D(printf(" Push ...\n"));
237 continue;
238 }
239
240 /* Shift the token */
241 if (shift(&ps->p_stack, type, str, x) < 0) {
242 D(printf(" MemError: shift.\n"));
243 return E_NOMEM;
244 }
245 D(printf(" Shift.\n"));
246 /* Pop while we are in an accept-only state */
247 while (s = &d->d_state
248 [ps->p_stack.s_top->s_state],
249 s->s_accept && s->s_narcs == 1) {
250 D(printf(" Direct pop.\n"));
251 s_pop(&ps->p_stack);
252 if (s_empty(&ps->p_stack)) {
253 D(printf(" ACCEPT.\n"));
254 return E_DONE;
255 }
256 d = ps->p_stack.s_top->s_dfa;
257 }
258 return E_OK;
259 }
260 }
261
262 if (s->s_accept) {
263 /* Pop this dfa and try again */
264 s_pop(&ps->p_stack);
265 D(printf(" Pop ...\n"));
266 if (s_empty(&ps->p_stack)) {
267 D(printf(" Error: bottom of stack.\n"));
268 return E_SYNTAX;
269 }
270 continue;
271 }
272
273 /* Stuck, report syntax error */
274 D(printf(" Error.\n"));
275 return E_SYNTAX;
276 }
277}
278
279
280#ifdef DEBUG
281
282/* DEBUG OUTPUT */
283
284void
285dumptree(g, n)
286 grammar *g;
287 node *n;
288{
289 int i;
290
291 if (n == NULL)
292 printf("NIL");
293 else {
294 label l;
295 l.lb_type = TYPE(n);
296 l.lb_str = TYPE(str);
297 printf("%s", labelrepr(&l));
298 if (ISNONTERMINAL(TYPE(n))) {
299 printf("(");
300 for (i = 0; i < NCH(n); i++) {
301 if (i > 0)
302 printf(",");
303 dumptree(g, CHILD(n, i));
304 }
305 printf(")");
306 }
307 }
308}
309
310void
311showtree(g, n)
312 grammar *g;
313 node *n;
314{
315 int i;
316
317 if (n == NULL)
318 return;
319 if (ISNONTERMINAL(TYPE(n))) {
320 for (i = 0; i < NCH(n); i++)
321 showtree(g, CHILD(n, i));
322 }
323 else if (ISTERMINAL(TYPE(n))) {
324 printf("%s", tok_name[TYPE(n)]);
325 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
326 printf("(%s)", STR(n));
327 printf(" ");
328 }
329 else
330 printf("? ");
331}
332
333void
334printtree(ps)
335 parser_state *ps;
336{
337 if (debugging) {
338 printf("Parse tree:\n");
339 dumptree(ps->p_grammar, ps->p_tree);
340 printf("\n");
341 printf("Tokens:\n");
342 showtree(ps->p_grammar, ps->p_tree);
343 printf("\n");
344 }
345 printf("Listing:\n");
346 listtree(ps->p_tree);
347 printf("\n");
348}
349
350#endif /* DEBUG */
351
352/*
353
354Description
355-----------
356
357The parser's interface is different than usual: the function addtoken()
358must be called for each token in the input. This makes it possible to
359turn it into an incremental parsing system later. The parsing system
360constructs a parse tree as it goes.
361
362A parsing rule is represented as a Deterministic Finite-state Automaton
363(DFA). A node in a DFA represents a state of the parser; an arc represents
364a transition. Transitions are either labeled with terminal symbols or
365with non-terminals. When the parser decides to follow an arc labeled
366with a non-terminal, it is invoked recursively with the DFA representing
367the parsing rule for that as its initial state; when that DFA accepts,
368the parser that invoked it continues. The parse tree constructed by the
369recursively called parser is inserted as a child in the current parse tree.
370
371The DFA's can be constructed automatically from a more conventional
372language description. An extended LL(1) grammar (ELL(1)) is suitable.
373Certain restrictions make the parser's life easier: rules that can produce
374the empty string should be outlawed (there are other ways to put loops
375or optional parts in the language). To avoid the need to construct
376FIRST sets, we can require that all but the last alternative of a rule
377(really: arc going out of a DFA's state) must begin with a terminal
378symbol.
379
380As an example, consider this grammar:
381
382expr: term (OP term)*
383term: CONSTANT | '(' expr ')'
384
385The DFA corresponding to the rule for expr is:
386
387------->.---term-->.------->
388 ^ |
389 | |
390 \----OP----/
391
392The parse tree generated for the input a+b is:
393
394(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
395
396*/