blob: c21b6fdf466d9984db326915c316675129005722 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Parser implementation */
3
4/* For a description, see the comments at end of this file */
5
6/* XXX To do: error recovery */
7
Tim Peters1ca12962001-12-04 03:18:48 +00008#include "Python.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009#include "token.h"
10#include "grammar.h"
11#include "node.h"
12#include "parser.h"
13#include "errcode.h"
Guido van Rossumdcfcd142019-01-31 03:40:27 -080014#include "graminit.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000015
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000016
Guido van Rossum408027e1996-12-30 16:17:54 +000017#ifdef Py_DEBUG
Guido van Rossum86bea461997-04-29 21:03:06 +000018extern int Py_DebugFlag;
19#define D(x) if (!Py_DebugFlag); else x
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000020#else
21#define D(x)
22#endif
23
24
25/* STACK DATA TYPE */
26
Tim Petersdbd9ba62000-07-09 03:09:57 +000027static void s_reset(stack *);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
29static void
Thomas Wouters23c9e002000-07-22 19:20:54 +000030s_reset(stack *s)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000031{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 s->s_top = &s->s_base[MAXSTACK];
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033}
34
35#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020038s_push(stack *s, dfa *d, node *parent)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000039{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020040 stackentry *top;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
44 }
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000050}
51
Guido van Rossum408027e1996-12-30 16:17:54 +000052#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000054static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020055s_pop(stack *s)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000056{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000057 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000060}
61
Guido van Rossum408027e1996-12-30 16:17:54 +000062#else /* !Py_DEBUG */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000063
64#define s_pop(s) (s)->s_top++
65
66#endif
67
68
69/* PARSER CREATION */
70
71parser_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000072PyParser_New(grammar *g, int start)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000073{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 parser_state *ps;
75
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
Thomas Wouters34aa7ba2006-02-28 19:02:24 +000082#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 ps->p_flags = 0;
Neil Schemenauerc155dd42002-03-22 23:38:11 +000084#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000085 ps->p_tree = PyNode_New(start);
86 if (ps->p_tree == NULL) {
87 PyMem_FREE(ps);
88 return NULL;
89 }
90 s_reset(&ps->p_stack);
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 return ps;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000093}
94
95void
Thomas Wouters23c9e002000-07-22 19:20:54 +000096PyParser_Delete(parser_state *ps)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000097{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps->p_tree);
101 PyMem_FREE(ps);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000102}
103
104
105/* PARSER STACK OPERATIONS */
106
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107static int
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000108shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
109 int end_lineno, int end_col_offset)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000111 int err;
112 assert(!s_empty(s));
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000113 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
114 end_lineno, end_col_offset);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 if (err)
116 return err;
117 s->s_top->s_state = newstate;
118 return 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000119}
120
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000121static int
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000122push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset,
123 int end_lineno, int end_col_offset)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000124{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000125 int err;
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200126 node *n;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000127 n = s->s_top->s_parent;
128 assert(!s_empty(s));
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000129 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
130 end_lineno, end_col_offset);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 if (err)
132 return err;
133 s->s_top->s_state = newstate;
134 return s_push(s, d, CHILD(n, NCH(n)-1));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135}
136
137
138/* PARSER PROPER */
139
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000140static int
Serhiy Storchakac6792272013-10-19 21:03:34 +0300141classify(parser_state *ps, int type, const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000142{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 grammar *g = ps->p_grammar;
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200144 int n = g->g_ll.ll_nlabels;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000145
146 if (type == NAME) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200147 label *l = g->g_ll.ll_label;
148 int i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000149 for (i = n; i > 0; i--, l++) {
150 if (l->lb_type != NAME || l->lb_str == NULL ||
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300151 l->lb_str[0] != str[0] ||
152 strcmp(l->lb_str, str) != 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 continue;
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000154#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000155#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Leaving this in as an example */
157 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300158 if (str[0] == 'w' && strcmp(str, "with") == 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 break; /* not a keyword yet */
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300160 else if (str[0] == 'a' && strcmp(str, "as") == 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 break; /* not a keyword yet */
162 }
Thomas Wouters8ae12952006-02-28 22:42:15 +0000163#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000164#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 D(printf("It's a keyword\n"));
166 return n - i;
167 }
168 }
169
170 {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200171 label *l = g->g_ll.ll_label;
172 int i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 for (i = n; i > 0; i--, l++) {
174 if (l->lb_type == type && l->lb_str == NULL) {
175 D(printf("It's a token we know\n"));
176 return n - i;
177 }
178 }
179 }
180
181 D(printf("Illegal token\n"));
182 return -1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000183}
184
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000185#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000186#if 0
Guido van Rossum45aecf42006-03-15 04:58:47 +0000187/* Leaving this in as an example */
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000188static void
189future_hack(parser_state *ps)
190{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 node *n = ps->p_stack.s_top->s_parent;
192 node *ch, *cch;
193 int i;
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000194
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 /* from __future__ import ..., must have at least 4 children */
196 n = CHILD(n, 0);
197 if (NCH(n) < 4)
198 return;
199 ch = CHILD(n, 0);
200 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
201 return;
202 ch = CHILD(n, 1);
203 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
204 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
205 return;
206 ch = CHILD(n, 3);
207 /* ch can be a star, a parenthesis or import_as_names */
208 if (TYPE(ch) == STAR)
209 return;
210 if (TYPE(ch) == LPAR)
211 ch = CHILD(n, 4);
212
213 for (i = 0; i < NCH(ch); i += 2) {
214 cch = CHILD(ch, i);
215 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
216 char *str_ch = STR(CHILD(cch, 0));
217 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
218 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
219 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
220 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
221 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
222 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
223 }
224 }
225 }
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000226}
Brett Cannone3944a52009-04-01 05:08:41 +0000227#endif
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000228#endif /* future keyword */
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000229
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000230int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200231PyParser_AddToken(parser_state *ps, int type, char *str,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000232 int lineno, int col_offset,
233 int end_lineno, int end_col_offset,
234 int *expected_ret)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000235{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200236 int ilabel;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000237 int err;
238
239 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
240
241 /* Find out which label this token is */
242 ilabel = classify(ps, type, str);
243 if (ilabel < 0)
244 return E_SYNTAX;
245
246 /* Loop until the token is shifted or an error occurred */
247 for (;;) {
248 /* Fetch the current dfa and state */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200249 dfa *d = ps->p_stack.s_top->s_dfa;
250 state *s = &d->d_state[ps->p_stack.s_top->s_state];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251
252 D(printf(" DFA '%s', state %d:",
253 d->d_name, ps->p_stack.s_top->s_state));
254
255 /* Check accelerator */
256 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200257 int x = s->s_accel[ilabel - s->s_lower];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 if (x != -1) {
259 if (x & (1<<7)) {
260 /* Push non-terminal */
261 int nt = (x >> 8) + NT_OFFSET;
262 int arrow = x & ((1<<7)-1);
Guido van Rossumdcfcd142019-01-31 03:40:27 -0800263 dfa *d1;
264 if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
265 /* When parsing type comments is not requested,
266 we can provide better errors about bad indentation
267 by using 'suite' for the body of a funcdef */
268 D(printf(" [switch func_body_suite to suite]"));
269 nt = suite;
270 }
271 d1 = PyGrammar_FindDFA(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000272 ps->p_grammar, nt);
273 if ((err = push(&ps->p_stack, nt, d1,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000274 arrow, lineno, col_offset,
275 end_lineno, end_col_offset)) > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 D(printf(" MemError: push\n"));
277 return err;
278 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -0800279 D(printf(" Push '%s'\n", d1->d_name));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 continue;
281 }
282
283 /* Shift the token */
284 if ((err = shift(&ps->p_stack, type, str,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000285 x, lineno, col_offset,
286 end_lineno, end_col_offset)) > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 D(printf(" MemError: shift.\n"));
288 return err;
289 }
290 D(printf(" Shift.\n"));
291 /* Pop while we are in an accept-only state */
292 while (s = &d->d_state
293 [ps->p_stack.s_top->s_state],
294 s->s_accept && s->s_narcs == 1) {
295 D(printf(" DFA '%s', state %d: "
296 "Direct pop.\n",
297 d->d_name,
298 ps->p_stack.s_top->s_state));
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000299#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000300#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 if (d->d_name[0] == 'i' &&
302 strcmp(d->d_name,
303 "import_stmt") == 0)
304 future_hack(ps);
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000305#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000306#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000307 s_pop(&ps->p_stack);
308 if (s_empty(&ps->p_stack)) {
309 D(printf(" ACCEPT.\n"));
310 return E_DONE;
311 }
312 d = ps->p_stack.s_top->s_dfa;
313 }
314 return E_OK;
315 }
316 }
317
318 if (s->s_accept) {
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000319#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000320#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 if (d->d_name[0] == 'i' &&
322 strcmp(d->d_name, "import_stmt") == 0)
323 future_hack(ps);
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000324#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000325#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 /* Pop this dfa and try again */
327 s_pop(&ps->p_stack);
328 D(printf(" Pop ...\n"));
329 if (s_empty(&ps->p_stack)) {
330 D(printf(" Error: bottom of stack.\n"));
331 return E_SYNTAX;
332 }
333 continue;
334 }
335
336 /* Stuck, report syntax error */
337 D(printf(" Error.\n"));
338 if (expected_ret) {
339 if (s->s_lower == s->s_upper - 1) {
340 /* Only one possible expected token */
341 *expected_ret = ps->p_grammar->
342 g_ll.ll_label[s->s_lower].lb_type;
343 }
344 else
345 *expected_ret = -1;
346 }
347 return E_SYNTAX;
348 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000349}
350
351
Guido van Rossum408027e1996-12-30 16:17:54 +0000352#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000353
354/* DEBUG OUTPUT */
355
356void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000357dumptree(grammar *g, node *n)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000358{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 int i;
360
361 if (n == NULL)
362 printf("NIL");
363 else {
364 label l;
365 l.lb_type = TYPE(n);
366 l.lb_str = STR(n);
367 printf("%s", PyGrammar_LabelRepr(&l));
368 if (ISNONTERMINAL(TYPE(n))) {
369 printf("(");
370 for (i = 0; i < NCH(n); i++) {
371 if (i > 0)
372 printf(",");
373 dumptree(g, CHILD(n, i));
374 }
375 printf(")");
376 }
377 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000378}
379
380void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000381showtree(grammar *g, node *n)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000382{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 int i;
384
385 if (n == NULL)
386 return;
387 if (ISNONTERMINAL(TYPE(n))) {
388 for (i = 0; i < NCH(n); i++)
389 showtree(g, CHILD(n, i));
390 }
391 else if (ISTERMINAL(TYPE(n))) {
392 printf("%s", _PyParser_TokenNames[TYPE(n)]);
393 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
394 printf("(%s)", STR(n));
395 printf(" ");
396 }
397 else
398 printf("? ");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000399}
400
401void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000402printtree(parser_state *ps)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000403{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 if (Py_DebugFlag) {
405 printf("Parse tree:\n");
406 dumptree(ps->p_grammar, ps->p_tree);
407 printf("\n");
408 printf("Tokens:\n");
409 showtree(ps->p_grammar, ps->p_tree);
410 printf("\n");
411 }
412 printf("Listing:\n");
413 PyNode_ListTree(ps->p_tree);
414 printf("\n");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000415}
416
Guido van Rossum408027e1996-12-30 16:17:54 +0000417#endif /* Py_DEBUG */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000418
419/*
420
421Description
422-----------
423
424The parser's interface is different than usual: the function addtoken()
425must be called for each token in the input. This makes it possible to
426turn it into an incremental parsing system later. The parsing system
427constructs a parse tree as it goes.
428
429A parsing rule is represented as a Deterministic Finite-state Automaton
430(DFA). A node in a DFA represents a state of the parser; an arc represents
431a transition. Transitions are either labeled with terminal symbols or
432with non-terminals. When the parser decides to follow an arc labeled
433with a non-terminal, it is invoked recursively with the DFA representing
434the parsing rule for that as its initial state; when that DFA accepts,
435the parser that invoked it continues. The parse tree constructed by the
436recursively called parser is inserted as a child in the current parse tree.
437
438The DFA's can be constructed automatically from a more conventional
439language description. An extended LL(1) grammar (ELL(1)) is suitable.
440Certain restrictions make the parser's life easier: rules that can produce
441the empty string should be outlawed (there are other ways to put loops
442or optional parts in the language). To avoid the need to construct
443FIRST sets, we can require that all but the last alternative of a rule
444(really: arc going out of a DFA's state) must begin with a terminal
445symbol.
446
447As an example, consider this grammar:
448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000449expr: term (OP term)*
450term: CONSTANT | '(' expr ')'
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000451
452The DFA corresponding to the rule for expr is:
453
454------->.---term-->.------->
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000455 ^ |
456 | |
457 \----OP----/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000458
459The parse tree generated for the input a+b is:
460
461(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
462
463*/