blob: bd1610ee7a3d47a77c24021244d7e4edb96a697f [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000025/* Parser generator */
Guido van Rossum3f5da241990-12-20 15:06:42 +000026/* XXX This file is not yet fully PROTOized */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027
28/* For a description, see the comments at end of this file */
29
Guido van Rossum3f5da241990-12-20 15:06:42 +000030#include "pgenheaders.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000031#include "assert.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000032#include "token.h"
33#include "node.h"
34#include "grammar.h"
35#include "metagrammar.h"
36#include "pgen.h"
37
38extern int debugging;
39
40
41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
42
43typedef struct _nfaarc {
44 int ar_label;
45 int ar_arrow;
46} nfaarc;
47
48typedef struct _nfastate {
49 int st_narcs;
50 nfaarc *st_arc;
51} nfastate;
52
53typedef struct _nfa {
54 int nf_type;
55 char *nf_name;
56 int nf_nstates;
57 nfastate *nf_state;
58 int nf_start, nf_finish;
59} nfa;
60
Guido van Rossum71f477c1991-04-03 19:09:02 +000061/* Forward */
62static compile_rhs PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
63static compile_alt PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
64static compile_item PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
65static compile_atom PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
66
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000067static int
68addnfastate(nf)
69 nfa *nf;
70{
71 nfastate *st;
72
73 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
74 if (nf->nf_state == NULL)
75 fatal("out of mem");
76 st = &nf->nf_state[nf->nf_nstates++];
77 st->st_narcs = 0;
78 st->st_arc = NULL;
79 return st - nf->nf_state;
80}
81
82static void
83addnfaarc(nf, from, to, lbl)
84 nfa *nf;
85 int from, to, lbl;
86{
87 nfastate *st;
88 nfaarc *ar;
89
90 st = &nf->nf_state[from];
91 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
92 if (st->st_arc == NULL)
93 fatal("out of mem");
94 ar = &st->st_arc[st->st_narcs++];
95 ar->ar_label = lbl;
96 ar->ar_arrow = to;
97}
98
99static nfa *
100newnfa(name)
101 char *name;
102{
103 nfa *nf;
104 static type = NT_OFFSET; /* All types will be disjunct */
105
106 nf = NEW(nfa, 1);
107 if (nf == NULL)
108 fatal("no mem for new nfa");
109 nf->nf_type = type++;
110 nf->nf_name = name; /* XXX strdup(name) ??? */
111 nf->nf_nstates = 0;
112 nf->nf_state = NULL;
113 nf->nf_start = nf->nf_finish = -1;
114 return nf;
115}
116
117typedef struct _nfagrammar {
118 int gr_nnfas;
119 nfa **gr_nfa;
120 labellist gr_ll;
121} nfagrammar;
122
Guido van Rossum71f477c1991-04-03 19:09:02 +0000123/* Forward */
124static compile_rule PROTO((nfagrammar *gr, node *n));
125
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000126static nfagrammar *
127newnfagrammar()
128{
129 nfagrammar *gr;
130
131 gr = NEW(nfagrammar, 1);
132 if (gr == NULL)
133 fatal("no mem for new nfa grammar");
134 gr->gr_nnfas = 0;
135 gr->gr_nfa = NULL;
136 gr->gr_ll.ll_nlabels = 0;
137 gr->gr_ll.ll_label = NULL;
138 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
139 return gr;
140}
141
142static nfa *
143addnfa(gr, name)
144 nfagrammar *gr;
145 char *name;
146{
147 nfa *nf;
148
149 nf = newnfa(name);
150 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
151 if (gr->gr_nfa == NULL)
152 fatal("out of mem");
153 gr->gr_nfa[gr->gr_nnfas++] = nf;
154 addlabel(&gr->gr_ll, NAME, nf->nf_name);
155 return nf;
156}
157
158#ifdef DEBUG
159
160static char REQNFMT[] = "metacompile: less than %d children\n";
161
162#define REQN(i, count) \
163 if (i < count) { \
164 fprintf(stderr, REQNFMT, count); \
165 abort(); \
166 } else
167
168#else
169#define REQN(i, count) /* empty */
170#endif
171
172static nfagrammar *
173metacompile(n)
174 node *n;
175{
176 nfagrammar *gr;
177 int i;
178
179 printf("Compiling (meta-) parse tree into NFA grammar\n");
180 gr = newnfagrammar();
181 REQ(n, MSTART);
182 i = n->n_nchildren - 1; /* Last child is ENDMARKER */
183 n = n->n_child;
184 for (; --i >= 0; n++) {
185 if (n->n_type != NEWLINE)
186 compile_rule(gr, n);
187 }
188 return gr;
189}
190
191static
192compile_rule(gr, n)
193 nfagrammar *gr;
194 node *n;
195{
196 nfa *nf;
197
198 REQ(n, RULE);
199 REQN(n->n_nchildren, 4);
200 n = n->n_child;
201 REQ(n, NAME);
202 nf = addnfa(gr, n->n_str);
203 n++;
204 REQ(n, COLON);
205 n++;
206 REQ(n, RHS);
207 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
208 n++;
209 REQ(n, NEWLINE);
210}
211
212static
213compile_rhs(ll, nf, n, pa, pb)
214 labellist *ll;
215 nfa *nf;
216 node *n;
217 int *pa, *pb;
218{
219 int i;
220 int a, b;
221
222 REQ(n, RHS);
223 i = n->n_nchildren;
224 REQN(i, 1);
225 n = n->n_child;
226 REQ(n, ALT);
227 compile_alt(ll, nf, n, pa, pb);
228 if (--i <= 0)
229 return;
230 n++;
231 a = *pa;
232 b = *pb;
233 *pa = addnfastate(nf);
234 *pb = addnfastate(nf);
235 addnfaarc(nf, *pa, a, EMPTY);
236 addnfaarc(nf, b, *pb, EMPTY);
237 for (; --i >= 0; n++) {
238 REQ(n, VBAR);
239 REQN(i, 1);
240 --i;
241 n++;
242 REQ(n, ALT);
243 compile_alt(ll, nf, n, &a, &b);
244 addnfaarc(nf, *pa, a, EMPTY);
245 addnfaarc(nf, b, *pb, EMPTY);
246 }
247}
248
249static
250compile_alt(ll, nf, n, pa, pb)
251 labellist *ll;
252 nfa *nf;
253 node *n;
254 int *pa, *pb;
255{
256 int i;
257 int a, b;
258
259 REQ(n, ALT);
260 i = n->n_nchildren;
261 REQN(i, 1);
262 n = n->n_child;
263 REQ(n, ITEM);
264 compile_item(ll, nf, n, pa, pb);
265 --i;
266 n++;
267 for (; --i >= 0; n++) {
268 if (n->n_type == COMMA) { /* XXX Temporary */
269 REQN(i, 1);
270 --i;
271 n++;
272 }
273 REQ(n, ITEM);
274 compile_item(ll, nf, n, &a, &b);
275 addnfaarc(nf, *pb, a, EMPTY);
276 *pb = b;
277 }
278}
279
280static
281compile_item(ll, nf, n, pa, pb)
282 labellist *ll;
283 nfa *nf;
284 node *n;
285 int *pa, *pb;
286{
287 int i;
288 int a, b;
289
290 REQ(n, ITEM);
291 i = n->n_nchildren;
292 REQN(i, 1);
293 n = n->n_child;
294 if (n->n_type == LSQB) {
295 REQN(i, 3);
296 n++;
297 REQ(n, RHS);
298 *pa = addnfastate(nf);
299 *pb = addnfastate(nf);
300 addnfaarc(nf, *pa, *pb, EMPTY);
301 compile_rhs(ll, nf, n, &a, &b);
302 addnfaarc(nf, *pa, a, EMPTY);
303 addnfaarc(nf, b, *pb, EMPTY);
304 REQN(i, 1);
305 n++;
306 REQ(n, RSQB);
307 }
308 else {
309 compile_atom(ll, nf, n, pa, pb);
310 if (--i <= 0)
311 return;
312 n++;
313 addnfaarc(nf, *pb, *pa, EMPTY);
314 if (n->n_type == STAR)
315 *pb = *pa;
316 else
317 REQ(n, PLUS);
318 }
319}
320
321static
322compile_atom(ll, nf, n, pa, pb)
323 labellist *ll;
324 nfa *nf;
325 node *n;
326 int *pa, *pb;
327{
328 int i;
329
330 REQ(n, ATOM);
331 i = n->n_nchildren;
332 REQN(i, 1);
333 n = n->n_child;
334 if (n->n_type == LPAR) {
335 REQN(i, 3);
336 n++;
337 REQ(n, RHS);
338 compile_rhs(ll, nf, n, pa, pb);
339 n++;
340 REQ(n, RPAR);
341 }
342 else if (n->n_type == NAME || n->n_type == STRING) {
343 *pa = addnfastate(nf);
344 *pb = addnfastate(nf);
345 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
346 }
347 else
348 REQ(n, NAME);
349}
350
351static void
352dumpstate(ll, nf, istate)
353 labellist *ll;
354 nfa *nf;
355 int istate;
356{
357 nfastate *st;
358 int i;
359 nfaarc *ar;
360
361 printf("%c%2d%c",
362 istate == nf->nf_start ? '*' : ' ',
363 istate,
364 istate == nf->nf_finish ? '.' : ' ');
365 st = &nf->nf_state[istate];
366 ar = st->st_arc;
367 for (i = 0; i < st->st_narcs; i++) {
368 if (i > 0)
369 printf("\n ");
370 printf("-> %2d %s", ar->ar_arrow,
371 labelrepr(&ll->ll_label[ar->ar_label]));
372 ar++;
373 }
374 printf("\n");
375}
376
377static void
378dumpnfa(ll, nf)
379 labellist *ll;
380 nfa *nf;
381{
382 int i;
383
384 printf("NFA '%s' has %d states; start %d, finish %d\n",
385 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
386 for (i = 0; i < nf->nf_nstates; i++)
387 dumpstate(ll, nf, i);
388}
389
390
391/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
392
393static int
394addclosure(ss, nf, istate)
395 bitset ss;
396 nfa *nf;
397 int istate;
398{
399 if (addbit(ss, istate)) {
400 nfastate *st = &nf->nf_state[istate];
401 nfaarc *ar = st->st_arc;
402 int i;
403
404 for (i = st->st_narcs; --i >= 0; ) {
405 if (ar->ar_label == EMPTY)
406 addclosure(ss, nf, ar->ar_arrow);
407 ar++;
408 }
409 }
410}
411
412typedef struct _ss_arc {
413 bitset sa_bitset;
414 int sa_arrow;
415 int sa_label;
416} ss_arc;
417
418typedef struct _ss_state {
419 bitset ss_ss;
420 int ss_narcs;
421 ss_arc *ss_arc;
422 int ss_deleted;
423 int ss_finish;
424 int ss_rename;
425} ss_state;
426
427typedef struct _ss_dfa {
428 int sd_nstates;
429 ss_state *sd_state;
430} ss_dfa;
431
Guido van Rossum71f477c1991-04-03 19:09:02 +0000432/* Forward */
433static printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits,
434 labellist *ll, char *msg));
435static simplify PROTO((int xx_nstates, ss_state *xx_state));
436static convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
437
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000438static
439makedfa(gr, nf, d)
440 nfagrammar *gr;
441 nfa *nf;
442 dfa *d;
443{
444 int nbits = nf->nf_nstates;
445 bitset ss;
446 int xx_nstates;
447 ss_state *xx_state, *yy;
448 ss_arc *zz;
449 int istate, jstate, iarc, jarc, ibit;
450 nfastate *st;
451 nfaarc *ar;
452
453 ss = newbitset(nbits);
454 addclosure(ss, nf, nf->nf_start);
455 xx_state = NEW(ss_state, 1);
456 if (xx_state == NULL)
457 fatal("no mem for xx_state in makedfa");
458 xx_nstates = 1;
459 yy = &xx_state[0];
460 yy->ss_ss = ss;
461 yy->ss_narcs = 0;
462 yy->ss_arc = NULL;
463 yy->ss_deleted = 0;
464 yy->ss_finish = testbit(ss, nf->nf_finish);
465 if (yy->ss_finish)
466 printf("Error: nonterminal '%s' may produce empty.\n",
467 nf->nf_name);
468
469 /* This algorithm is from a book written before
470 the invention of structured programming... */
471
472 /* For each unmarked state... */
473 for (istate = 0; istate < xx_nstates; ++istate) {
474 yy = &xx_state[istate];
475 ss = yy->ss_ss;
476 /* For all its states... */
477 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
478 if (!testbit(ss, ibit))
479 continue;
480 st = &nf->nf_state[ibit];
481 /* For all non-empty arcs from this state... */
482 for (iarc = 0; iarc < st->st_narcs; iarc++) {
483 ar = &st->st_arc[iarc];
484 if (ar->ar_label == EMPTY)
485 continue;
486 /* Look up in list of arcs from this state */
487 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
488 zz = &yy->ss_arc[jarc];
489 if (ar->ar_label == zz->sa_label)
490 goto found;
491 }
492 /* Add new arc for this state */
493 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
494 if (yy->ss_arc == NULL)
495 fatal("out of mem");
496 zz = &yy->ss_arc[yy->ss_narcs++];
497 zz->sa_label = ar->ar_label;
498 zz->sa_bitset = newbitset(nbits);
499 zz->sa_arrow = -1;
500 found: ;
501 /* Add destination */
502 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
503 }
504 }
505 /* Now look up all the arrow states */
506 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
507 zz = &xx_state[istate].ss_arc[jarc];
508 for (jstate = 0; jstate < xx_nstates; jstate++) {
509 if (samebitset(zz->sa_bitset,
510 xx_state[jstate].ss_ss, nbits)) {
511 zz->sa_arrow = jstate;
512 goto done;
513 }
514 }
515 RESIZE(xx_state, ss_state, xx_nstates + 1);
516 if (xx_state == NULL)
517 fatal("out of mem");
518 zz->sa_arrow = xx_nstates;
519 yy = &xx_state[xx_nstates++];
520 yy->ss_ss = zz->sa_bitset;
521 yy->ss_narcs = 0;
522 yy->ss_arc = NULL;
523 yy->ss_deleted = 0;
524 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
525 done: ;
526 }
527 }
528
529 if (debugging)
530 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
531 "before minimizing");
532
533 simplify(xx_nstates, xx_state);
534
535 if (debugging)
536 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
537 "after minimizing");
538
539 convert(d, xx_nstates, xx_state);
540
541 /* XXX cleanup */
542}
543
544static
545printssdfa(xx_nstates, xx_state, nbits, ll, msg)
546 int xx_nstates;
547 ss_state *xx_state;
548 int nbits;
549 labellist *ll;
550 char *msg;
551{
552 int i, ibit, iarc;
553 ss_state *yy;
554 ss_arc *zz;
555
556 printf("Subset DFA %s\n", msg);
557 for (i = 0; i < xx_nstates; i++) {
558 yy = &xx_state[i];
559 if (yy->ss_deleted)
560 continue;
561 printf(" Subset %d", i);
562 if (yy->ss_finish)
563 printf(" (finish)");
564 printf(" { ");
565 for (ibit = 0; ibit < nbits; ibit++) {
566 if (testbit(yy->ss_ss, ibit))
567 printf("%d ", ibit);
568 }
569 printf("}\n");
570 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
571 zz = &yy->ss_arc[iarc];
572 printf(" Arc to state %d, label %s\n",
573 zz->sa_arrow,
574 labelrepr(&ll->ll_label[zz->sa_label]));
575 }
576 }
577}
578
579
580/* PART THREE -- SIMPLIFY DFA */
581
582/* Simplify the DFA by repeatedly eliminating states that are
583 equivalent to another oner. This is NOT Algorithm 3.3 from
584 [Aho&Ullman 77]. It does not always finds the minimal DFA,
585 but it does usually make a much smaller one... (For an example
586 of sub-optimal behaviour, try S: x a b+ | y a b+.)
587*/
588
589static int
590samestate(s1, s2)
591 ss_state *s1, *s2;
592{
593 int i;
594
595 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
596 return 0;
597 for (i = 0; i < s1->ss_narcs; i++) {
598 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
599 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
600 return 0;
601 }
602 return 1;
603}
604
605static void
606renamestates(xx_nstates, xx_state, from, to)
607 int xx_nstates;
608 ss_state *xx_state;
609 int from, to;
610{
611 int i, j;
612
613 if (debugging)
614 printf("Rename state %d to %d.\n", from, to);
615 for (i = 0; i < xx_nstates; i++) {
616 if (xx_state[i].ss_deleted)
617 continue;
618 for (j = 0; j < xx_state[i].ss_narcs; j++) {
619 if (xx_state[i].ss_arc[j].sa_arrow == from)
620 xx_state[i].ss_arc[j].sa_arrow = to;
621 }
622 }
623}
624
625static
626simplify(xx_nstates, xx_state)
627 int xx_nstates;
628 ss_state *xx_state;
629{
630 int changes;
631 int i, j, k;
632
633 do {
634 changes = 0;
635 for (i = 1; i < xx_nstates; i++) {
636 if (xx_state[i].ss_deleted)
637 continue;
638 for (j = 0; j < i; j++) {
639 if (xx_state[j].ss_deleted)
640 continue;
641 if (samestate(&xx_state[i], &xx_state[j])) {
642 xx_state[i].ss_deleted++;
643 renamestates(xx_nstates, xx_state, i, j);
644 changes++;
645 break;
646 }
647 }
648 }
649 } while (changes);
650}
651
652
653/* PART FOUR -- GENERATE PARSING TABLES */
654
655/* Convert the DFA into a grammar that can be used by our parser */
656
657static
658convert(d, xx_nstates, xx_state)
659 dfa *d;
660 int xx_nstates;
661 ss_state *xx_state;
662{
663 int i, j;
664 ss_state *yy;
665 ss_arc *zz;
666
667 for (i = 0; i < xx_nstates; i++) {
668 yy = &xx_state[i];
669 if (yy->ss_deleted)
670 continue;
671 yy->ss_rename = addstate(d);
672 }
673
674 for (i = 0; i < xx_nstates; i++) {
675 yy = &xx_state[i];
676 if (yy->ss_deleted)
677 continue;
678 for (j = 0; j < yy->ss_narcs; j++) {
679 zz = &yy->ss_arc[j];
680 addarc(d, yy->ss_rename,
681 xx_state[zz->sa_arrow].ss_rename,
682 zz->sa_label);
683 }
684 if (yy->ss_finish)
685 addarc(d, yy->ss_rename, yy->ss_rename, 0);
686 }
687
688 d->d_initial = 0;
689}
690
691
692/* PART FIVE -- GLUE IT ALL TOGETHER */
693
694static grammar *
695maketables(gr)
696 nfagrammar *gr;
697{
698 int i;
699 nfa *nf;
700 dfa *d;
701 grammar *g;
702
703 if (gr->gr_nnfas == 0)
704 return NULL;
705 g = newgrammar(gr->gr_nfa[0]->nf_type);
706 /* XXX first rule must be start rule */
707 g->g_ll = gr->gr_ll;
708
709 for (i = 0; i < gr->gr_nnfas; i++) {
710 nf = gr->gr_nfa[i];
711 if (debugging) {
712 printf("Dump of NFA for '%s' ...\n", nf->nf_name);
713 dumpnfa(&gr->gr_ll, nf);
714 }
715 printf("Making DFA for '%s' ...\n", nf->nf_name);
716 d = adddfa(g, nf->nf_type, nf->nf_name);
717 makedfa(gr, gr->gr_nfa[i], d);
718 }
719
720 return g;
721}
722
723grammar *
724pgen(n)
725 node *n;
726{
727 nfagrammar *gr;
728 grammar *g;
729
730 gr = metacompile(n);
731 g = maketables(gr);
732 translatelabels(g);
733 addfirstsets(g);
734 return g;
735}
736
737
738/*
739
740Description
741-----------
742
743Input is a grammar in extended BNF (using * for repetition, + for
744at-least-once repetition, [] for optional parts, | for alternatives and
745() for grouping). This has already been parsed and turned into a parse
746tree.
747
748Each rule is considered as a regular expression in its own right.
749It is turned into a Non-deterministic Finite Automaton (NFA), which
750is then turned into a Deterministic Finite Automaton (DFA), which is then
751optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
752or similar compiler books (this technique is more often used for lexical
753analyzers).
754
755The DFA's are used by the parser as parsing tables in a special way
756that's probably unique. Before they are usable, the FIRST sets of all
757non-terminals are computed.
758
759Reference
760---------
761
762[Aho&Ullman 77]
763 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
764 (first edition)
765
766*/