blob: fa4a8f011ff525193b31dd5187ace3ebc61a385b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Parser implementation */
3
4/* For a description, see the comments at end of this file */
5
6/* XXX To do: error recovery */
7
Tim Peters1ca12962001-12-04 03:18:48 +00008#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00009#include "pgenheaders.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "token.h"
11#include "grammar.h"
12#include "node.h"
13#include "parser.h"
14#include "errcode.h"
Guido van Rossumdcfcd142019-01-31 03:40:27 -080015#include "graminit.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000016
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000017
Guido van Rossum408027e1996-12-30 16:17:54 +000018#ifdef Py_DEBUG
Guido van Rossum86bea461997-04-29 21:03:06 +000019extern int Py_DebugFlag;
20#define D(x) if (!Py_DebugFlag); else x
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000021#else
22#define D(x)
23#endif
24
25
26/* STACK DATA TYPE */
27
Tim Petersdbd9ba62000-07-09 03:09:57 +000028static void s_reset(stack *);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
30static void
Thomas Wouters23c9e002000-07-22 19:20:54 +000031s_reset(stack *s)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000032{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000033 s->s_top = &s->s_base[MAXSTACK];
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000034}
35
36#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
37
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020039s_push(stack *s, dfa *d, node *parent)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020041 stackentry *top;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000042 if (s->s_top == s->s_base) {
43 fprintf(stderr, "s_push: parser stack overflow\n");
44 return E_NOMEM;
45 }
46 top = --s->s_top;
47 top->s_dfa = d;
48 top->s_parent = parent;
49 top->s_state = 0;
50 return 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000051}
52
Guido van Rossum408027e1996-12-30 16:17:54 +000053#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000054
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000055static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056s_pop(stack *s)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 if (s_empty(s))
59 Py_FatalError("s_pop: parser stack underflow -- FATAL");
60 s->s_top++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000061}
62
Guido van Rossum408027e1996-12-30 16:17:54 +000063#else /* !Py_DEBUG */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000064
65#define s_pop(s) (s)->s_top++
66
67#endif
68
69
70/* PARSER CREATION */
71
72parser_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000073PyParser_New(grammar *g, int start)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000074{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 parser_state *ps;
76
77 if (!g->g_accel)
78 PyGrammar_AddAccelerators(g);
79 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
80 if (ps == NULL)
81 return NULL;
82 ps->p_grammar = g;
Thomas Wouters34aa7ba2006-02-28 19:02:24 +000083#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000084 ps->p_flags = 0;
Neil Schemenauerc155dd42002-03-22 23:38:11 +000085#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000086 ps->p_tree = PyNode_New(start);
87 if (ps->p_tree == NULL) {
88 PyMem_FREE(ps);
89 return NULL;
90 }
91 s_reset(&ps->p_stack);
92 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
93 return ps;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000094}
95
96void
Thomas Wouters23c9e002000-07-22 19:20:54 +000097PyParser_Delete(parser_state *ps)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000098{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 /* NB If you want to save the parse tree,
100 you must set p_tree to NULL before calling delparser! */
101 PyNode_Free(ps->p_tree);
102 PyMem_FREE(ps);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103}
104
105
106/* PARSER STACK OPERATIONS */
107
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108static int
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000109shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
110 int end_lineno, int end_col_offset)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 int err;
113 assert(!s_empty(s));
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000114 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
115 end_lineno, end_col_offset);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 if (err)
117 return err;
118 s->s_top->s_state = newstate;
119 return 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120}
121
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122static int
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000123push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset,
124 int end_lineno, int end_col_offset)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 int err;
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200127 node *n;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000128 n = s->s_top->s_parent;
129 assert(!s_empty(s));
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000130 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
131 end_lineno, end_col_offset);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 if (err)
133 return err;
134 s->s_top->s_state = newstate;
135 return s_push(s, d, CHILD(n, NCH(n)-1));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136}
137
138
139/* PARSER PROPER */
140
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000141static int
Serhiy Storchakac6792272013-10-19 21:03:34 +0300142classify(parser_state *ps, int type, const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000143{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000144 grammar *g = ps->p_grammar;
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200145 int n = g->g_ll.ll_nlabels;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146
147 if (type == NAME) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200148 label *l = g->g_ll.ll_label;
149 int i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 for (i = n; i > 0; i--, l++) {
151 if (l->lb_type != NAME || l->lb_str == NULL ||
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300152 l->lb_str[0] != str[0] ||
153 strcmp(l->lb_str, str) != 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 continue;
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000155#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000156#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000157 /* Leaving this in as an example */
158 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300159 if (str[0] == 'w' && strcmp(str, "with") == 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 break; /* not a keyword yet */
Berker Peksag2a65ecb2016-03-28 00:45:28 +0300161 else if (str[0] == 'a' && strcmp(str, "as") == 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 break; /* not a keyword yet */
163 }
Thomas Wouters8ae12952006-02-28 22:42:15 +0000164#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000165#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 D(printf("It's a keyword\n"));
167 return n - i;
168 }
169 }
170
171 {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200172 label *l = g->g_ll.ll_label;
173 int i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 for (i = n; i > 0; i--, l++) {
175 if (l->lb_type == type && l->lb_str == NULL) {
176 D(printf("It's a token we know\n"));
177 return n - i;
178 }
179 }
180 }
181
182 D(printf("Illegal token\n"));
183 return -1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000184}
185
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000186#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000187#if 0
Guido van Rossum45aecf42006-03-15 04:58:47 +0000188/* Leaving this in as an example */
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000189static void
190future_hack(parser_state *ps)
191{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 node *n = ps->p_stack.s_top->s_parent;
193 node *ch, *cch;
194 int i;
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000195
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 /* from __future__ import ..., must have at least 4 children */
197 n = CHILD(n, 0);
198 if (NCH(n) < 4)
199 return;
200 ch = CHILD(n, 0);
201 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
202 return;
203 ch = CHILD(n, 1);
204 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
205 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
206 return;
207 ch = CHILD(n, 3);
208 /* ch can be a star, a parenthesis or import_as_names */
209 if (TYPE(ch) == STAR)
210 return;
211 if (TYPE(ch) == LPAR)
212 ch = CHILD(n, 4);
213
214 for (i = 0; i < NCH(ch); i += 2) {
215 cch = CHILD(ch, i);
216 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
217 char *str_ch = STR(CHILD(cch, 0));
218 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
219 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
220 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
221 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
222 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
223 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
224 }
225 }
226 }
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000227}
Brett Cannone3944a52009-04-01 05:08:41 +0000228#endif
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000229#endif /* future keyword */
Guido van Rossumb09f7ed2001-07-15 21:08:29 +0000230
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000231int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200232PyParser_AddToken(parser_state *ps, int type, char *str,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000233 int lineno, int col_offset,
234 int end_lineno, int end_col_offset,
235 int *expected_ret)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000236{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200237 int ilabel;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 int err;
239
240 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
241
242 /* Find out which label this token is */
243 ilabel = classify(ps, type, str);
244 if (ilabel < 0)
245 return E_SYNTAX;
246
247 /* Loop until the token is shifted or an error occurred */
248 for (;;) {
249 /* Fetch the current dfa and state */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200250 dfa *d = ps->p_stack.s_top->s_dfa;
251 state *s = &d->d_state[ps->p_stack.s_top->s_state];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252
253 D(printf(" DFA '%s', state %d:",
254 d->d_name, ps->p_stack.s_top->s_state));
255
256 /* Check accelerator */
257 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200258 int x = s->s_accel[ilabel - s->s_lower];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 if (x != -1) {
260 if (x & (1<<7)) {
261 /* Push non-terminal */
262 int nt = (x >> 8) + NT_OFFSET;
263 int arrow = x & ((1<<7)-1);
Guido van Rossumdcfcd142019-01-31 03:40:27 -0800264 dfa *d1;
265 if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
266 /* When parsing type comments is not requested,
267 we can provide better errors about bad indentation
268 by using 'suite' for the body of a funcdef */
269 D(printf(" [switch func_body_suite to suite]"));
270 nt = suite;
271 }
272 d1 = PyGrammar_FindDFA(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 ps->p_grammar, nt);
274 if ((err = push(&ps->p_stack, nt, d1,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000275 arrow, lineno, col_offset,
276 end_lineno, end_col_offset)) > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000277 D(printf(" MemError: push\n"));
278 return err;
279 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -0800280 D(printf(" Push '%s'\n", d1->d_name));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 continue;
282 }
283
284 /* Shift the token */
285 if ((err = shift(&ps->p_stack, type, str,
Ivan Levkivskyi9932a222019-01-22 11:18:22 +0000286 x, lineno, col_offset,
287 end_lineno, end_col_offset)) > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 D(printf(" MemError: shift.\n"));
289 return err;
290 }
291 D(printf(" Shift.\n"));
292 /* Pop while we are in an accept-only state */
293 while (s = &d->d_state
294 [ps->p_stack.s_top->s_state],
295 s->s_accept && s->s_narcs == 1) {
296 D(printf(" DFA '%s', state %d: "
297 "Direct pop.\n",
298 d->d_name,
299 ps->p_stack.s_top->s_state));
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000300#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000301#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000302 if (d->d_name[0] == 'i' &&
303 strcmp(d->d_name,
304 "import_stmt") == 0)
305 future_hack(ps);
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000306#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000307#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 s_pop(&ps->p_stack);
309 if (s_empty(&ps->p_stack)) {
310 D(printf(" ACCEPT.\n"));
311 return E_DONE;
312 }
313 d = ps->p_stack.s_top->s_dfa;
314 }
315 return E_OK;
316 }
317 }
318
319 if (s->s_accept) {
Thomas Wouters34aa7ba2006-02-28 19:02:24 +0000320#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
Brett Cannone3944a52009-04-01 05:08:41 +0000321#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 if (d->d_name[0] == 'i' &&
323 strcmp(d->d_name, "import_stmt") == 0)
324 future_hack(ps);
Neil Schemenauerc155dd42002-03-22 23:38:11 +0000325#endif
Brett Cannone3944a52009-04-01 05:08:41 +0000326#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000327 /* Pop this dfa and try again */
328 s_pop(&ps->p_stack);
329 D(printf(" Pop ...\n"));
330 if (s_empty(&ps->p_stack)) {
331 D(printf(" Error: bottom of stack.\n"));
332 return E_SYNTAX;
333 }
334 continue;
335 }
336
337 /* Stuck, report syntax error */
338 D(printf(" Error.\n"));
339 if (expected_ret) {
340 if (s->s_lower == s->s_upper - 1) {
341 /* Only one possible expected token */
342 *expected_ret = ps->p_grammar->
343 g_ll.ll_label[s->s_lower].lb_type;
344 }
345 else
346 *expected_ret = -1;
347 }
348 return E_SYNTAX;
349 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000350}
351
352
Guido van Rossum408027e1996-12-30 16:17:54 +0000353#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000354
355/* DEBUG OUTPUT */
356
357void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000358dumptree(grammar *g, node *n)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000359{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 int i;
361
362 if (n == NULL)
363 printf("NIL");
364 else {
365 label l;
366 l.lb_type = TYPE(n);
367 l.lb_str = STR(n);
368 printf("%s", PyGrammar_LabelRepr(&l));
369 if (ISNONTERMINAL(TYPE(n))) {
370 printf("(");
371 for (i = 0; i < NCH(n); i++) {
372 if (i > 0)
373 printf(",");
374 dumptree(g, CHILD(n, i));
375 }
376 printf(")");
377 }
378 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000379}
380
381void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000382showtree(grammar *g, node *n)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000383{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000384 int i;
385
386 if (n == NULL)
387 return;
388 if (ISNONTERMINAL(TYPE(n))) {
389 for (i = 0; i < NCH(n); i++)
390 showtree(g, CHILD(n, i));
391 }
392 else if (ISTERMINAL(TYPE(n))) {
393 printf("%s", _PyParser_TokenNames[TYPE(n)]);
394 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
395 printf("(%s)", STR(n));
396 printf(" ");
397 }
398 else
399 printf("? ");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000400}
401
402void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000403printtree(parser_state *ps)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000404{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 if (Py_DebugFlag) {
406 printf("Parse tree:\n");
407 dumptree(ps->p_grammar, ps->p_tree);
408 printf("\n");
409 printf("Tokens:\n");
410 showtree(ps->p_grammar, ps->p_tree);
411 printf("\n");
412 }
413 printf("Listing:\n");
414 PyNode_ListTree(ps->p_tree);
415 printf("\n");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000416}
417
Guido van Rossum408027e1996-12-30 16:17:54 +0000418#endif /* Py_DEBUG */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000419
420/*
421
422Description
423-----------
424
425The parser's interface is different than usual: the function addtoken()
426must be called for each token in the input. This makes it possible to
427turn it into an incremental parsing system later. The parsing system
428constructs a parse tree as it goes.
429
430A parsing rule is represented as a Deterministic Finite-state Automaton
431(DFA). A node in a DFA represents a state of the parser; an arc represents
432a transition. Transitions are either labeled with terminal symbols or
433with non-terminals. When the parser decides to follow an arc labeled
434with a non-terminal, it is invoked recursively with the DFA representing
435the parsing rule for that as its initial state; when that DFA accepts,
436the parser that invoked it continues. The parse tree constructed by the
437recursively called parser is inserted as a child in the current parse tree.
438
439The DFA's can be constructed automatically from a more conventional
440language description. An extended LL(1) grammar (ELL(1)) is suitable.
441Certain restrictions make the parser's life easier: rules that can produce
442the empty string should be outlawed (there are other ways to put loops
443or optional parts in the language). To avoid the need to construct
444FIRST sets, we can require that all but the last alternative of a rule
445(really: arc going out of a DFA's state) must begin with a terminal
446symbol.
447
448As an example, consider this grammar:
449
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000450expr: term (OP term)*
451term: CONSTANT | '(' expr ')'
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000452
453The DFA corresponding to the rule for expr is:
454
455------->.---term-->.------->
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 ^ |
457 | |
458 \----OP----/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000459
460The parse tree generated for the input a+b is:
461
462(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
463
464*/