blob: 34d9b71edce8f589aeb061b90b4b3826b4e1f0d5 [file] [log] [blame]
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001/* Parser generator */
2
3/* For a description, see the comments at end of this file */
4
5#include <stdio.h>
6#include "assert.h"
7
8#include "PROTO.h"
9#include "malloc.h"
10#include "token.h"
11#include "node.h"
12#include "grammar.h"
13#include "metagrammar.h"
14#include "pgen.h"
15
16extern int debugging;
17
18
19/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
20
21typedef struct _nfaarc {
22 int ar_label;
23 int ar_arrow;
24} nfaarc;
25
26typedef struct _nfastate {
27 int st_narcs;
28 nfaarc *st_arc;
29} nfastate;
30
31typedef struct _nfa {
32 int nf_type;
33 char *nf_name;
34 int nf_nstates;
35 nfastate *nf_state;
36 int nf_start, nf_finish;
37} nfa;
38
39static int
40addnfastate(nf)
41 nfa *nf;
42{
43 nfastate *st;
44
45 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
46 if (nf->nf_state == NULL)
47 fatal("out of mem");
48 st = &nf->nf_state[nf->nf_nstates++];
49 st->st_narcs = 0;
50 st->st_arc = NULL;
51 return st - nf->nf_state;
52}
53
54static void
55addnfaarc(nf, from, to, lbl)
56 nfa *nf;
57 int from, to, lbl;
58{
59 nfastate *st;
60 nfaarc *ar;
61
62 st = &nf->nf_state[from];
63 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
64 if (st->st_arc == NULL)
65 fatal("out of mem");
66 ar = &st->st_arc[st->st_narcs++];
67 ar->ar_label = lbl;
68 ar->ar_arrow = to;
69}
70
71static nfa *
72newnfa(name)
73 char *name;
74{
75 nfa *nf;
76 static type = NT_OFFSET; /* All types will be disjunct */
77
78 nf = NEW(nfa, 1);
79 if (nf == NULL)
80 fatal("no mem for new nfa");
81 nf->nf_type = type++;
82 nf->nf_name = name; /* XXX strdup(name) ??? */
83 nf->nf_nstates = 0;
84 nf->nf_state = NULL;
85 nf->nf_start = nf->nf_finish = -1;
86 return nf;
87}
88
89typedef struct _nfagrammar {
90 int gr_nnfas;
91 nfa **gr_nfa;
92 labellist gr_ll;
93} nfagrammar;
94
95static nfagrammar *
96newnfagrammar()
97{
98 nfagrammar *gr;
99
100 gr = NEW(nfagrammar, 1);
101 if (gr == NULL)
102 fatal("no mem for new nfa grammar");
103 gr->gr_nnfas = 0;
104 gr->gr_nfa = NULL;
105 gr->gr_ll.ll_nlabels = 0;
106 gr->gr_ll.ll_label = NULL;
107 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
108 return gr;
109}
110
111static nfa *
112addnfa(gr, name)
113 nfagrammar *gr;
114 char *name;
115{
116 nfa *nf;
117
118 nf = newnfa(name);
119 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
120 if (gr->gr_nfa == NULL)
121 fatal("out of mem");
122 gr->gr_nfa[gr->gr_nnfas++] = nf;
123 addlabel(&gr->gr_ll, NAME, nf->nf_name);
124 return nf;
125}
126
127#ifdef DEBUG
128
129static char REQNFMT[] = "metacompile: less than %d children\n";
130
131#define REQN(i, count) \
132 if (i < count) { \
133 fprintf(stderr, REQNFMT, count); \
134 abort(); \
135 } else
136
137#else
138#define REQN(i, count) /* empty */
139#endif
140
141static nfagrammar *
142metacompile(n)
143 node *n;
144{
145 nfagrammar *gr;
146 int i;
147
148 printf("Compiling (meta-) parse tree into NFA grammar\n");
149 gr = newnfagrammar();
150 REQ(n, MSTART);
151 i = n->n_nchildren - 1; /* Last child is ENDMARKER */
152 n = n->n_child;
153 for (; --i >= 0; n++) {
154 if (n->n_type != NEWLINE)
155 compile_rule(gr, n);
156 }
157 return gr;
158}
159
160static
161compile_rule(gr, n)
162 nfagrammar *gr;
163 node *n;
164{
165 nfa *nf;
166
167 REQ(n, RULE);
168 REQN(n->n_nchildren, 4);
169 n = n->n_child;
170 REQ(n, NAME);
171 nf = addnfa(gr, n->n_str);
172 n++;
173 REQ(n, COLON);
174 n++;
175 REQ(n, RHS);
176 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
177 n++;
178 REQ(n, NEWLINE);
179}
180
181static
182compile_rhs(ll, nf, n, pa, pb)
183 labellist *ll;
184 nfa *nf;
185 node *n;
186 int *pa, *pb;
187{
188 int i;
189 int a, b;
190
191 REQ(n, RHS);
192 i = n->n_nchildren;
193 REQN(i, 1);
194 n = n->n_child;
195 REQ(n, ALT);
196 compile_alt(ll, nf, n, pa, pb);
197 if (--i <= 0)
198 return;
199 n++;
200 a = *pa;
201 b = *pb;
202 *pa = addnfastate(nf);
203 *pb = addnfastate(nf);
204 addnfaarc(nf, *pa, a, EMPTY);
205 addnfaarc(nf, b, *pb, EMPTY);
206 for (; --i >= 0; n++) {
207 REQ(n, VBAR);
208 REQN(i, 1);
209 --i;
210 n++;
211 REQ(n, ALT);
212 compile_alt(ll, nf, n, &a, &b);
213 addnfaarc(nf, *pa, a, EMPTY);
214 addnfaarc(nf, b, *pb, EMPTY);
215 }
216}
217
218static
219compile_alt(ll, nf, n, pa, pb)
220 labellist *ll;
221 nfa *nf;
222 node *n;
223 int *pa, *pb;
224{
225 int i;
226 int a, b;
227
228 REQ(n, ALT);
229 i = n->n_nchildren;
230 REQN(i, 1);
231 n = n->n_child;
232 REQ(n, ITEM);
233 compile_item(ll, nf, n, pa, pb);
234 --i;
235 n++;
236 for (; --i >= 0; n++) {
237 if (n->n_type == COMMA) { /* XXX Temporary */
238 REQN(i, 1);
239 --i;
240 n++;
241 }
242 REQ(n, ITEM);
243 compile_item(ll, nf, n, &a, &b);
244 addnfaarc(nf, *pb, a, EMPTY);
245 *pb = b;
246 }
247}
248
249static
250compile_item(ll, nf, n, pa, pb)
251 labellist *ll;
252 nfa *nf;
253 node *n;
254 int *pa, *pb;
255{
256 int i;
257 int a, b;
258
259 REQ(n, ITEM);
260 i = n->n_nchildren;
261 REQN(i, 1);
262 n = n->n_child;
263 if (n->n_type == LSQB) {
264 REQN(i, 3);
265 n++;
266 REQ(n, RHS);
267 *pa = addnfastate(nf);
268 *pb = addnfastate(nf);
269 addnfaarc(nf, *pa, *pb, EMPTY);
270 compile_rhs(ll, nf, n, &a, &b);
271 addnfaarc(nf, *pa, a, EMPTY);
272 addnfaarc(nf, b, *pb, EMPTY);
273 REQN(i, 1);
274 n++;
275 REQ(n, RSQB);
276 }
277 else {
278 compile_atom(ll, nf, n, pa, pb);
279 if (--i <= 0)
280 return;
281 n++;
282 addnfaarc(nf, *pb, *pa, EMPTY);
283 if (n->n_type == STAR)
284 *pb = *pa;
285 else
286 REQ(n, PLUS);
287 }
288}
289
290static
291compile_atom(ll, nf, n, pa, pb)
292 labellist *ll;
293 nfa *nf;
294 node *n;
295 int *pa, *pb;
296{
297 int i;
298
299 REQ(n, ATOM);
300 i = n->n_nchildren;
301 REQN(i, 1);
302 n = n->n_child;
303 if (n->n_type == LPAR) {
304 REQN(i, 3);
305 n++;
306 REQ(n, RHS);
307 compile_rhs(ll, nf, n, pa, pb);
308 n++;
309 REQ(n, RPAR);
310 }
311 else if (n->n_type == NAME || n->n_type == STRING) {
312 *pa = addnfastate(nf);
313 *pb = addnfastate(nf);
314 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
315 }
316 else
317 REQ(n, NAME);
318}
319
320static void
321dumpstate(ll, nf, istate)
322 labellist *ll;
323 nfa *nf;
324 int istate;
325{
326 nfastate *st;
327 int i;
328 nfaarc *ar;
329
330 printf("%c%2d%c",
331 istate == nf->nf_start ? '*' : ' ',
332 istate,
333 istate == nf->nf_finish ? '.' : ' ');
334 st = &nf->nf_state[istate];
335 ar = st->st_arc;
336 for (i = 0; i < st->st_narcs; i++) {
337 if (i > 0)
338 printf("\n ");
339 printf("-> %2d %s", ar->ar_arrow,
340 labelrepr(&ll->ll_label[ar->ar_label]));
341 ar++;
342 }
343 printf("\n");
344}
345
346static void
347dumpnfa(ll, nf)
348 labellist *ll;
349 nfa *nf;
350{
351 int i;
352
353 printf("NFA '%s' has %d states; start %d, finish %d\n",
354 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
355 for (i = 0; i < nf->nf_nstates; i++)
356 dumpstate(ll, nf, i);
357}
358
359
360/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
361
362static int
363addclosure(ss, nf, istate)
364 bitset ss;
365 nfa *nf;
366 int istate;
367{
368 if (addbit(ss, istate)) {
369 nfastate *st = &nf->nf_state[istate];
370 nfaarc *ar = st->st_arc;
371 int i;
372
373 for (i = st->st_narcs; --i >= 0; ) {
374 if (ar->ar_label == EMPTY)
375 addclosure(ss, nf, ar->ar_arrow);
376 ar++;
377 }
378 }
379}
380
381typedef struct _ss_arc {
382 bitset sa_bitset;
383 int sa_arrow;
384 int sa_label;
385} ss_arc;
386
387typedef struct _ss_state {
388 bitset ss_ss;
389 int ss_narcs;
390 ss_arc *ss_arc;
391 int ss_deleted;
392 int ss_finish;
393 int ss_rename;
394} ss_state;
395
396typedef struct _ss_dfa {
397 int sd_nstates;
398 ss_state *sd_state;
399} ss_dfa;
400
401static
402makedfa(gr, nf, d)
403 nfagrammar *gr;
404 nfa *nf;
405 dfa *d;
406{
407 int nbits = nf->nf_nstates;
408 bitset ss;
409 int xx_nstates;
410 ss_state *xx_state, *yy;
411 ss_arc *zz;
412 int istate, jstate, iarc, jarc, ibit;
413 nfastate *st;
414 nfaarc *ar;
415
416 ss = newbitset(nbits);
417 addclosure(ss, nf, nf->nf_start);
418 xx_state = NEW(ss_state, 1);
419 if (xx_state == NULL)
420 fatal("no mem for xx_state in makedfa");
421 xx_nstates = 1;
422 yy = &xx_state[0];
423 yy->ss_ss = ss;
424 yy->ss_narcs = 0;
425 yy->ss_arc = NULL;
426 yy->ss_deleted = 0;
427 yy->ss_finish = testbit(ss, nf->nf_finish);
428 if (yy->ss_finish)
429 printf("Error: nonterminal '%s' may produce empty.\n",
430 nf->nf_name);
431
432 /* This algorithm is from a book written before
433 the invention of structured programming... */
434
435 /* For each unmarked state... */
436 for (istate = 0; istate < xx_nstates; ++istate) {
437 yy = &xx_state[istate];
438 ss = yy->ss_ss;
439 /* For all its states... */
440 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
441 if (!testbit(ss, ibit))
442 continue;
443 st = &nf->nf_state[ibit];
444 /* For all non-empty arcs from this state... */
445 for (iarc = 0; iarc < st->st_narcs; iarc++) {
446 ar = &st->st_arc[iarc];
447 if (ar->ar_label == EMPTY)
448 continue;
449 /* Look up in list of arcs from this state */
450 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
451 zz = &yy->ss_arc[jarc];
452 if (ar->ar_label == zz->sa_label)
453 goto found;
454 }
455 /* Add new arc for this state */
456 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
457 if (yy->ss_arc == NULL)
458 fatal("out of mem");
459 zz = &yy->ss_arc[yy->ss_narcs++];
460 zz->sa_label = ar->ar_label;
461 zz->sa_bitset = newbitset(nbits);
462 zz->sa_arrow = -1;
463 found: ;
464 /* Add destination */
465 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
466 }
467 }
468 /* Now look up all the arrow states */
469 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
470 zz = &xx_state[istate].ss_arc[jarc];
471 for (jstate = 0; jstate < xx_nstates; jstate++) {
472 if (samebitset(zz->sa_bitset,
473 xx_state[jstate].ss_ss, nbits)) {
474 zz->sa_arrow = jstate;
475 goto done;
476 }
477 }
478 RESIZE(xx_state, ss_state, xx_nstates + 1);
479 if (xx_state == NULL)
480 fatal("out of mem");
481 zz->sa_arrow = xx_nstates;
482 yy = &xx_state[xx_nstates++];
483 yy->ss_ss = zz->sa_bitset;
484 yy->ss_narcs = 0;
485 yy->ss_arc = NULL;
486 yy->ss_deleted = 0;
487 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
488 done: ;
489 }
490 }
491
492 if (debugging)
493 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
494 "before minimizing");
495
496 simplify(xx_nstates, xx_state);
497
498 if (debugging)
499 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
500 "after minimizing");
501
502 convert(d, xx_nstates, xx_state);
503
504 /* XXX cleanup */
505}
506
507static
508printssdfa(xx_nstates, xx_state, nbits, ll, msg)
509 int xx_nstates;
510 ss_state *xx_state;
511 int nbits;
512 labellist *ll;
513 char *msg;
514{
515 int i, ibit, iarc;
516 ss_state *yy;
517 ss_arc *zz;
518
519 printf("Subset DFA %s\n", msg);
520 for (i = 0; i < xx_nstates; i++) {
521 yy = &xx_state[i];
522 if (yy->ss_deleted)
523 continue;
524 printf(" Subset %d", i);
525 if (yy->ss_finish)
526 printf(" (finish)");
527 printf(" { ");
528 for (ibit = 0; ibit < nbits; ibit++) {
529 if (testbit(yy->ss_ss, ibit))
530 printf("%d ", ibit);
531 }
532 printf("}\n");
533 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
534 zz = &yy->ss_arc[iarc];
535 printf(" Arc to state %d, label %s\n",
536 zz->sa_arrow,
537 labelrepr(&ll->ll_label[zz->sa_label]));
538 }
539 }
540}
541
542
543/* PART THREE -- SIMPLIFY DFA */
544
545/* Simplify the DFA by repeatedly eliminating states that are
546 equivalent to another oner. This is NOT Algorithm 3.3 from
547 [Aho&Ullman 77]. It does not always finds the minimal DFA,
548 but it does usually make a much smaller one... (For an example
549 of sub-optimal behaviour, try S: x a b+ | y a b+.)
550*/
551
552static int
553samestate(s1, s2)
554 ss_state *s1, *s2;
555{
556 int i;
557
558 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
559 return 0;
560 for (i = 0; i < s1->ss_narcs; i++) {
561 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
562 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
563 return 0;
564 }
565 return 1;
566}
567
568static void
569renamestates(xx_nstates, xx_state, from, to)
570 int xx_nstates;
571 ss_state *xx_state;
572 int from, to;
573{
574 int i, j;
575
576 if (debugging)
577 printf("Rename state %d to %d.\n", from, to);
578 for (i = 0; i < xx_nstates; i++) {
579 if (xx_state[i].ss_deleted)
580 continue;
581 for (j = 0; j < xx_state[i].ss_narcs; j++) {
582 if (xx_state[i].ss_arc[j].sa_arrow == from)
583 xx_state[i].ss_arc[j].sa_arrow = to;
584 }
585 }
586}
587
588static
589simplify(xx_nstates, xx_state)
590 int xx_nstates;
591 ss_state *xx_state;
592{
593 int changes;
594 int i, j, k;
595
596 do {
597 changes = 0;
598 for (i = 1; i < xx_nstates; i++) {
599 if (xx_state[i].ss_deleted)
600 continue;
601 for (j = 0; j < i; j++) {
602 if (xx_state[j].ss_deleted)
603 continue;
604 if (samestate(&xx_state[i], &xx_state[j])) {
605 xx_state[i].ss_deleted++;
606 renamestates(xx_nstates, xx_state, i, j);
607 changes++;
608 break;
609 }
610 }
611 }
612 } while (changes);
613}
614
615
616/* PART FOUR -- GENERATE PARSING TABLES */
617
618/* Convert the DFA into a grammar that can be used by our parser */
619
620static
621convert(d, xx_nstates, xx_state)
622 dfa *d;
623 int xx_nstates;
624 ss_state *xx_state;
625{
626 int i, j;
627 ss_state *yy;
628 ss_arc *zz;
629
630 for (i = 0; i < xx_nstates; i++) {
631 yy = &xx_state[i];
632 if (yy->ss_deleted)
633 continue;
634 yy->ss_rename = addstate(d);
635 }
636
637 for (i = 0; i < xx_nstates; i++) {
638 yy = &xx_state[i];
639 if (yy->ss_deleted)
640 continue;
641 for (j = 0; j < yy->ss_narcs; j++) {
642 zz = &yy->ss_arc[j];
643 addarc(d, yy->ss_rename,
644 xx_state[zz->sa_arrow].ss_rename,
645 zz->sa_label);
646 }
647 if (yy->ss_finish)
648 addarc(d, yy->ss_rename, yy->ss_rename, 0);
649 }
650
651 d->d_initial = 0;
652}
653
654
655/* PART FIVE -- GLUE IT ALL TOGETHER */
656
657static grammar *
658maketables(gr)
659 nfagrammar *gr;
660{
661 int i;
662 nfa *nf;
663 dfa *d;
664 grammar *g;
665
666 if (gr->gr_nnfas == 0)
667 return NULL;
668 g = newgrammar(gr->gr_nfa[0]->nf_type);
669 /* XXX first rule must be start rule */
670 g->g_ll = gr->gr_ll;
671
672 for (i = 0; i < gr->gr_nnfas; i++) {
673 nf = gr->gr_nfa[i];
674 if (debugging) {
675 printf("Dump of NFA for '%s' ...\n", nf->nf_name);
676 dumpnfa(&gr->gr_ll, nf);
677 }
678 printf("Making DFA for '%s' ...\n", nf->nf_name);
679 d = adddfa(g, nf->nf_type, nf->nf_name);
680 makedfa(gr, gr->gr_nfa[i], d);
681 }
682
683 return g;
684}
685
686grammar *
687pgen(n)
688 node *n;
689{
690 nfagrammar *gr;
691 grammar *g;
692
693 gr = metacompile(n);
694 g = maketables(gr);
695 translatelabels(g);
696 addfirstsets(g);
697 return g;
698}
699
700
701/*
702
703Description
704-----------
705
706Input is a grammar in extended BNF (using * for repetition, + for
707at-least-once repetition, [] for optional parts, | for alternatives and
708() for grouping). This has already been parsed and turned into a parse
709tree.
710
711Each rule is considered as a regular expression in its own right.
712It is turned into a Non-deterministic Finite Automaton (NFA), which
713is then turned into a Deterministic Finite Automaton (DFA), which is then
714optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
715or similar compiler books (this technique is more often used for lexical
716analyzers).
717
718The DFA's are used by the parser as parsing tables in a special way
719that's probably unique. Before they are usable, the FIRST sets of all
720non-terminals are computed.
721
722Reference
723---------
724
725[Aho&Ullman 77]
726 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
727 (first edition)
728
729*/