blob: a8c016bb857e42f1ae8699482958bd3e3bf99ae9 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000025/* Parser generator */
Guido van Rossum3f5da241990-12-20 15:06:42 +000026/* XXX This file is not yet fully PROTOized */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027
28/* For a description, see the comments at end of this file */
29
Guido van Rossum3f5da241990-12-20 15:06:42 +000030#include "pgenheaders.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000031#include "assert.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000032#include "token.h"
33#include "node.h"
34#include "grammar.h"
35#include "metagrammar.h"
36#include "pgen.h"
37
38extern int debugging;
39
40
41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
42
43typedef struct _nfaarc {
44 int ar_label;
45 int ar_arrow;
46} nfaarc;
47
48typedef struct _nfastate {
49 int st_narcs;
50 nfaarc *st_arc;
51} nfastate;
52
53typedef struct _nfa {
54 int nf_type;
55 char *nf_name;
56 int nf_nstates;
57 nfastate *nf_state;
58 int nf_start, nf_finish;
59} nfa;
60
61static int
62addnfastate(nf)
63 nfa *nf;
64{
65 nfastate *st;
66
67 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
68 if (nf->nf_state == NULL)
69 fatal("out of mem");
70 st = &nf->nf_state[nf->nf_nstates++];
71 st->st_narcs = 0;
72 st->st_arc = NULL;
73 return st - nf->nf_state;
74}
75
76static void
77addnfaarc(nf, from, to, lbl)
78 nfa *nf;
79 int from, to, lbl;
80{
81 nfastate *st;
82 nfaarc *ar;
83
84 st = &nf->nf_state[from];
85 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
86 if (st->st_arc == NULL)
87 fatal("out of mem");
88 ar = &st->st_arc[st->st_narcs++];
89 ar->ar_label = lbl;
90 ar->ar_arrow = to;
91}
92
93static nfa *
94newnfa(name)
95 char *name;
96{
97 nfa *nf;
98 static type = NT_OFFSET; /* All types will be disjunct */
99
100 nf = NEW(nfa, 1);
101 if (nf == NULL)
102 fatal("no mem for new nfa");
103 nf->nf_type = type++;
104 nf->nf_name = name; /* XXX strdup(name) ??? */
105 nf->nf_nstates = 0;
106 nf->nf_state = NULL;
107 nf->nf_start = nf->nf_finish = -1;
108 return nf;
109}
110
111typedef struct _nfagrammar {
112 int gr_nnfas;
113 nfa **gr_nfa;
114 labellist gr_ll;
115} nfagrammar;
116
117static nfagrammar *
118newnfagrammar()
119{
120 nfagrammar *gr;
121
122 gr = NEW(nfagrammar, 1);
123 if (gr == NULL)
124 fatal("no mem for new nfa grammar");
125 gr->gr_nnfas = 0;
126 gr->gr_nfa = NULL;
127 gr->gr_ll.ll_nlabels = 0;
128 gr->gr_ll.ll_label = NULL;
129 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
130 return gr;
131}
132
133static nfa *
134addnfa(gr, name)
135 nfagrammar *gr;
136 char *name;
137{
138 nfa *nf;
139
140 nf = newnfa(name);
141 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
142 if (gr->gr_nfa == NULL)
143 fatal("out of mem");
144 gr->gr_nfa[gr->gr_nnfas++] = nf;
145 addlabel(&gr->gr_ll, NAME, nf->nf_name);
146 return nf;
147}
148
149#ifdef DEBUG
150
151static char REQNFMT[] = "metacompile: less than %d children\n";
152
153#define REQN(i, count) \
154 if (i < count) { \
155 fprintf(stderr, REQNFMT, count); \
156 abort(); \
157 } else
158
159#else
160#define REQN(i, count) /* empty */
161#endif
162
163static nfagrammar *
164metacompile(n)
165 node *n;
166{
167 nfagrammar *gr;
168 int i;
169
170 printf("Compiling (meta-) parse tree into NFA grammar\n");
171 gr = newnfagrammar();
172 REQ(n, MSTART);
173 i = n->n_nchildren - 1; /* Last child is ENDMARKER */
174 n = n->n_child;
175 for (; --i >= 0; n++) {
176 if (n->n_type != NEWLINE)
177 compile_rule(gr, n);
178 }
179 return gr;
180}
181
182static
183compile_rule(gr, n)
184 nfagrammar *gr;
185 node *n;
186{
187 nfa *nf;
188
189 REQ(n, RULE);
190 REQN(n->n_nchildren, 4);
191 n = n->n_child;
192 REQ(n, NAME);
193 nf = addnfa(gr, n->n_str);
194 n++;
195 REQ(n, COLON);
196 n++;
197 REQ(n, RHS);
198 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
199 n++;
200 REQ(n, NEWLINE);
201}
202
203static
204compile_rhs(ll, nf, n, pa, pb)
205 labellist *ll;
206 nfa *nf;
207 node *n;
208 int *pa, *pb;
209{
210 int i;
211 int a, b;
212
213 REQ(n, RHS);
214 i = n->n_nchildren;
215 REQN(i, 1);
216 n = n->n_child;
217 REQ(n, ALT);
218 compile_alt(ll, nf, n, pa, pb);
219 if (--i <= 0)
220 return;
221 n++;
222 a = *pa;
223 b = *pb;
224 *pa = addnfastate(nf);
225 *pb = addnfastate(nf);
226 addnfaarc(nf, *pa, a, EMPTY);
227 addnfaarc(nf, b, *pb, EMPTY);
228 for (; --i >= 0; n++) {
229 REQ(n, VBAR);
230 REQN(i, 1);
231 --i;
232 n++;
233 REQ(n, ALT);
234 compile_alt(ll, nf, n, &a, &b);
235 addnfaarc(nf, *pa, a, EMPTY);
236 addnfaarc(nf, b, *pb, EMPTY);
237 }
238}
239
240static
241compile_alt(ll, nf, n, pa, pb)
242 labellist *ll;
243 nfa *nf;
244 node *n;
245 int *pa, *pb;
246{
247 int i;
248 int a, b;
249
250 REQ(n, ALT);
251 i = n->n_nchildren;
252 REQN(i, 1);
253 n = n->n_child;
254 REQ(n, ITEM);
255 compile_item(ll, nf, n, pa, pb);
256 --i;
257 n++;
258 for (; --i >= 0; n++) {
259 if (n->n_type == COMMA) { /* XXX Temporary */
260 REQN(i, 1);
261 --i;
262 n++;
263 }
264 REQ(n, ITEM);
265 compile_item(ll, nf, n, &a, &b);
266 addnfaarc(nf, *pb, a, EMPTY);
267 *pb = b;
268 }
269}
270
271static
272compile_item(ll, nf, n, pa, pb)
273 labellist *ll;
274 nfa *nf;
275 node *n;
276 int *pa, *pb;
277{
278 int i;
279 int a, b;
280
281 REQ(n, ITEM);
282 i = n->n_nchildren;
283 REQN(i, 1);
284 n = n->n_child;
285 if (n->n_type == LSQB) {
286 REQN(i, 3);
287 n++;
288 REQ(n, RHS);
289 *pa = addnfastate(nf);
290 *pb = addnfastate(nf);
291 addnfaarc(nf, *pa, *pb, EMPTY);
292 compile_rhs(ll, nf, n, &a, &b);
293 addnfaarc(nf, *pa, a, EMPTY);
294 addnfaarc(nf, b, *pb, EMPTY);
295 REQN(i, 1);
296 n++;
297 REQ(n, RSQB);
298 }
299 else {
300 compile_atom(ll, nf, n, pa, pb);
301 if (--i <= 0)
302 return;
303 n++;
304 addnfaarc(nf, *pb, *pa, EMPTY);
305 if (n->n_type == STAR)
306 *pb = *pa;
307 else
308 REQ(n, PLUS);
309 }
310}
311
312static
313compile_atom(ll, nf, n, pa, pb)
314 labellist *ll;
315 nfa *nf;
316 node *n;
317 int *pa, *pb;
318{
319 int i;
320
321 REQ(n, ATOM);
322 i = n->n_nchildren;
323 REQN(i, 1);
324 n = n->n_child;
325 if (n->n_type == LPAR) {
326 REQN(i, 3);
327 n++;
328 REQ(n, RHS);
329 compile_rhs(ll, nf, n, pa, pb);
330 n++;
331 REQ(n, RPAR);
332 }
333 else if (n->n_type == NAME || n->n_type == STRING) {
334 *pa = addnfastate(nf);
335 *pb = addnfastate(nf);
336 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
337 }
338 else
339 REQ(n, NAME);
340}
341
342static void
343dumpstate(ll, nf, istate)
344 labellist *ll;
345 nfa *nf;
346 int istate;
347{
348 nfastate *st;
349 int i;
350 nfaarc *ar;
351
352 printf("%c%2d%c",
353 istate == nf->nf_start ? '*' : ' ',
354 istate,
355 istate == nf->nf_finish ? '.' : ' ');
356 st = &nf->nf_state[istate];
357 ar = st->st_arc;
358 for (i = 0; i < st->st_narcs; i++) {
359 if (i > 0)
360 printf("\n ");
361 printf("-> %2d %s", ar->ar_arrow,
362 labelrepr(&ll->ll_label[ar->ar_label]));
363 ar++;
364 }
365 printf("\n");
366}
367
368static void
369dumpnfa(ll, nf)
370 labellist *ll;
371 nfa *nf;
372{
373 int i;
374
375 printf("NFA '%s' has %d states; start %d, finish %d\n",
376 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
377 for (i = 0; i < nf->nf_nstates; i++)
378 dumpstate(ll, nf, i);
379}
380
381
382/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
383
384static int
385addclosure(ss, nf, istate)
386 bitset ss;
387 nfa *nf;
388 int istate;
389{
390 if (addbit(ss, istate)) {
391 nfastate *st = &nf->nf_state[istate];
392 nfaarc *ar = st->st_arc;
393 int i;
394
395 for (i = st->st_narcs; --i >= 0; ) {
396 if (ar->ar_label == EMPTY)
397 addclosure(ss, nf, ar->ar_arrow);
398 ar++;
399 }
400 }
401}
402
403typedef struct _ss_arc {
404 bitset sa_bitset;
405 int sa_arrow;
406 int sa_label;
407} ss_arc;
408
409typedef struct _ss_state {
410 bitset ss_ss;
411 int ss_narcs;
412 ss_arc *ss_arc;
413 int ss_deleted;
414 int ss_finish;
415 int ss_rename;
416} ss_state;
417
418typedef struct _ss_dfa {
419 int sd_nstates;
420 ss_state *sd_state;
421} ss_dfa;
422
423static
424makedfa(gr, nf, d)
425 nfagrammar *gr;
426 nfa *nf;
427 dfa *d;
428{
429 int nbits = nf->nf_nstates;
430 bitset ss;
431 int xx_nstates;
432 ss_state *xx_state, *yy;
433 ss_arc *zz;
434 int istate, jstate, iarc, jarc, ibit;
435 nfastate *st;
436 nfaarc *ar;
437
438 ss = newbitset(nbits);
439 addclosure(ss, nf, nf->nf_start);
440 xx_state = NEW(ss_state, 1);
441 if (xx_state == NULL)
442 fatal("no mem for xx_state in makedfa");
443 xx_nstates = 1;
444 yy = &xx_state[0];
445 yy->ss_ss = ss;
446 yy->ss_narcs = 0;
447 yy->ss_arc = NULL;
448 yy->ss_deleted = 0;
449 yy->ss_finish = testbit(ss, nf->nf_finish);
450 if (yy->ss_finish)
451 printf("Error: nonterminal '%s' may produce empty.\n",
452 nf->nf_name);
453
454 /* This algorithm is from a book written before
455 the invention of structured programming... */
456
457 /* For each unmarked state... */
458 for (istate = 0; istate < xx_nstates; ++istate) {
459 yy = &xx_state[istate];
460 ss = yy->ss_ss;
461 /* For all its states... */
462 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
463 if (!testbit(ss, ibit))
464 continue;
465 st = &nf->nf_state[ibit];
466 /* For all non-empty arcs from this state... */
467 for (iarc = 0; iarc < st->st_narcs; iarc++) {
468 ar = &st->st_arc[iarc];
469 if (ar->ar_label == EMPTY)
470 continue;
471 /* Look up in list of arcs from this state */
472 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
473 zz = &yy->ss_arc[jarc];
474 if (ar->ar_label == zz->sa_label)
475 goto found;
476 }
477 /* Add new arc for this state */
478 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
479 if (yy->ss_arc == NULL)
480 fatal("out of mem");
481 zz = &yy->ss_arc[yy->ss_narcs++];
482 zz->sa_label = ar->ar_label;
483 zz->sa_bitset = newbitset(nbits);
484 zz->sa_arrow = -1;
485 found: ;
486 /* Add destination */
487 addclosure(zz->sa_bitset, nf, ar->ar_arrow);
488 }
489 }
490 /* Now look up all the arrow states */
491 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
492 zz = &xx_state[istate].ss_arc[jarc];
493 for (jstate = 0; jstate < xx_nstates; jstate++) {
494 if (samebitset(zz->sa_bitset,
495 xx_state[jstate].ss_ss, nbits)) {
496 zz->sa_arrow = jstate;
497 goto done;
498 }
499 }
500 RESIZE(xx_state, ss_state, xx_nstates + 1);
501 if (xx_state == NULL)
502 fatal("out of mem");
503 zz->sa_arrow = xx_nstates;
504 yy = &xx_state[xx_nstates++];
505 yy->ss_ss = zz->sa_bitset;
506 yy->ss_narcs = 0;
507 yy->ss_arc = NULL;
508 yy->ss_deleted = 0;
509 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
510 done: ;
511 }
512 }
513
514 if (debugging)
515 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
516 "before minimizing");
517
518 simplify(xx_nstates, xx_state);
519
520 if (debugging)
521 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
522 "after minimizing");
523
524 convert(d, xx_nstates, xx_state);
525
526 /* XXX cleanup */
527}
528
529static
530printssdfa(xx_nstates, xx_state, nbits, ll, msg)
531 int xx_nstates;
532 ss_state *xx_state;
533 int nbits;
534 labellist *ll;
535 char *msg;
536{
537 int i, ibit, iarc;
538 ss_state *yy;
539 ss_arc *zz;
540
541 printf("Subset DFA %s\n", msg);
542 for (i = 0; i < xx_nstates; i++) {
543 yy = &xx_state[i];
544 if (yy->ss_deleted)
545 continue;
546 printf(" Subset %d", i);
547 if (yy->ss_finish)
548 printf(" (finish)");
549 printf(" { ");
550 for (ibit = 0; ibit < nbits; ibit++) {
551 if (testbit(yy->ss_ss, ibit))
552 printf("%d ", ibit);
553 }
554 printf("}\n");
555 for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
556 zz = &yy->ss_arc[iarc];
557 printf(" Arc to state %d, label %s\n",
558 zz->sa_arrow,
559 labelrepr(&ll->ll_label[zz->sa_label]));
560 }
561 }
562}
563
564
565/* PART THREE -- SIMPLIFY DFA */
566
567/* Simplify the DFA by repeatedly eliminating states that are
568 equivalent to another oner. This is NOT Algorithm 3.3 from
569 [Aho&Ullman 77]. It does not always finds the minimal DFA,
570 but it does usually make a much smaller one... (For an example
571 of sub-optimal behaviour, try S: x a b+ | y a b+.)
572*/
573
574static int
575samestate(s1, s2)
576 ss_state *s1, *s2;
577{
578 int i;
579
580 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
581 return 0;
582 for (i = 0; i < s1->ss_narcs; i++) {
583 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
584 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
585 return 0;
586 }
587 return 1;
588}
589
590static void
591renamestates(xx_nstates, xx_state, from, to)
592 int xx_nstates;
593 ss_state *xx_state;
594 int from, to;
595{
596 int i, j;
597
598 if (debugging)
599 printf("Rename state %d to %d.\n", from, to);
600 for (i = 0; i < xx_nstates; i++) {
601 if (xx_state[i].ss_deleted)
602 continue;
603 for (j = 0; j < xx_state[i].ss_narcs; j++) {
604 if (xx_state[i].ss_arc[j].sa_arrow == from)
605 xx_state[i].ss_arc[j].sa_arrow = to;
606 }
607 }
608}
609
610static
611simplify(xx_nstates, xx_state)
612 int xx_nstates;
613 ss_state *xx_state;
614{
615 int changes;
616 int i, j, k;
617
618 do {
619 changes = 0;
620 for (i = 1; i < xx_nstates; i++) {
621 if (xx_state[i].ss_deleted)
622 continue;
623 for (j = 0; j < i; j++) {
624 if (xx_state[j].ss_deleted)
625 continue;
626 if (samestate(&xx_state[i], &xx_state[j])) {
627 xx_state[i].ss_deleted++;
628 renamestates(xx_nstates, xx_state, i, j);
629 changes++;
630 break;
631 }
632 }
633 }
634 } while (changes);
635}
636
637
638/* PART FOUR -- GENERATE PARSING TABLES */
639
640/* Convert the DFA into a grammar that can be used by our parser */
641
642static
643convert(d, xx_nstates, xx_state)
644 dfa *d;
645 int xx_nstates;
646 ss_state *xx_state;
647{
648 int i, j;
649 ss_state *yy;
650 ss_arc *zz;
651
652 for (i = 0; i < xx_nstates; i++) {
653 yy = &xx_state[i];
654 if (yy->ss_deleted)
655 continue;
656 yy->ss_rename = addstate(d);
657 }
658
659 for (i = 0; i < xx_nstates; i++) {
660 yy = &xx_state[i];
661 if (yy->ss_deleted)
662 continue;
663 for (j = 0; j < yy->ss_narcs; j++) {
664 zz = &yy->ss_arc[j];
665 addarc(d, yy->ss_rename,
666 xx_state[zz->sa_arrow].ss_rename,
667 zz->sa_label);
668 }
669 if (yy->ss_finish)
670 addarc(d, yy->ss_rename, yy->ss_rename, 0);
671 }
672
673 d->d_initial = 0;
674}
675
676
677/* PART FIVE -- GLUE IT ALL TOGETHER */
678
679static grammar *
680maketables(gr)
681 nfagrammar *gr;
682{
683 int i;
684 nfa *nf;
685 dfa *d;
686 grammar *g;
687
688 if (gr->gr_nnfas == 0)
689 return NULL;
690 g = newgrammar(gr->gr_nfa[0]->nf_type);
691 /* XXX first rule must be start rule */
692 g->g_ll = gr->gr_ll;
693
694 for (i = 0; i < gr->gr_nnfas; i++) {
695 nf = gr->gr_nfa[i];
696 if (debugging) {
697 printf("Dump of NFA for '%s' ...\n", nf->nf_name);
698 dumpnfa(&gr->gr_ll, nf);
699 }
700 printf("Making DFA for '%s' ...\n", nf->nf_name);
701 d = adddfa(g, nf->nf_type, nf->nf_name);
702 makedfa(gr, gr->gr_nfa[i], d);
703 }
704
705 return g;
706}
707
708grammar *
709pgen(n)
710 node *n;
711{
712 nfagrammar *gr;
713 grammar *g;
714
715 gr = metacompile(n);
716 g = maketables(gr);
717 translatelabels(g);
718 addfirstsets(g);
719 return g;
720}
721
722
723/*
724
725Description
726-----------
727
728Input is a grammar in extended BNF (using * for repetition, + for
729at-least-once repetition, [] for optional parts, | for alternatives and
730() for grouping). This has already been parsed and turned into a parse
731tree.
732
733Each rule is considered as a regular expression in its own right.
734It is turned into a Non-deterministic Finite Automaton (NFA), which
735is then turned into a Deterministic Finite Automaton (DFA), which is then
736optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
737or similar compiler books (this technique is more often used for lexical
738analyzers).
739
740The DFA's are used by the parser as parsing tables in a special way
741that's probably unique. Before they are usable, the FIRST sets of all
742non-terminals are computed.
743
744Reference
745---------
746
747[Aho&Ullman 77]
748 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
749 (first edition)
750
751*/