blob: 324d9b6548c77b7fabe03ace2bf3b5f56b3a3ebb [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Guido van Rossum3f5da241990-12-20 15:06:42 +00004#include "pgenheaders.h"
5
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00006#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00008#include "tokenizer.h"
9#include "errcode.h"
10
/* Line reader used for interactive input.  It returns:
     - a malloc'ed string including the trailing \n;
     - an empty malloc'ed string for EOF;
     - NULL if interrupted. */
extern char *PyOS_Readline(char *);

/* Default distance between tab stops when measuring indentation.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#if defined(__CHAR_UNSIGNED__)
#define Py_CHARMASK(c) (c)
#else
#define Py_CHARMASK(c) ((c) & 0xff)
#endif
26
/* Forward declarations of file-local helpers that are defined further
   down in this file */
static struct tok_state *tok_new(void);
static void tok_backup(struct tok_state *tok, int c);
static int tok_nextc(struct tok_state *tok);
Guido van Rossum3f5da241990-12-20 15:06:42 +000031
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    /* 26 */ "LBRACE", "RBRACE",
    /* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 36 */ "DOUBLESTAR",
    /* 37 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 41 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
    /* 44 */ "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL",
    /* 47 */ "DOUBLESTAREQUAL",
    /* 48 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 50 */ "OP",
    /* 51 */ "<ERRORTOKEN>",
    /* 52 */ "<N_TOKENS>"
};
90
91
92/* Create and initialize a new tok_state structure */
93
94static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000095tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000096{
Guido van Rossum86bea461997-04-29 21:03:06 +000097 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000098 if (tok == NULL)
99 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000100 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000101 tok->done = E_OK;
102 tok->fp = NULL;
103 tok->tabsize = TABSIZE;
104 tok->indent = 0;
105 tok->indstack[0] = 0;
106 tok->atbol = 1;
107 tok->pendin = 0;
108 tok->prompt = tok->nextprompt = NULL;
109 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000110 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000111 tok->filename = NULL;
112 tok->altwarning = 0;
113 tok->alterror = 0;
114 tok->alttabsize = 1;
115 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000116 return tok;
117}
118
119
120/* Set up tokenizer for string */
121
122struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000123PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000124{
125 struct tok_state *tok = tok_new();
126 if (tok == NULL)
127 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000128 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000129 return tok;
130}
131
132
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000133/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000134
135struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000136PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137{
138 struct tok_state *tok = tok_new();
139 if (tok == NULL)
140 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000141 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
142 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000143 return NULL;
144 }
145 tok->cur = tok->inp = tok->buf;
146 tok->end = tok->buf + BUFSIZ;
147 tok->fp = fp;
148 tok->prompt = ps1;
149 tok->nextprompt = ps2;
150 return tok;
151}
152
153
154/* Free a tok_state structure */
155
156void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000157PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000158{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000160 PyMem_DEL(tok->buf);
161 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000168tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000169{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000170 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000171 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000172 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000173 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000174 if (tok->done != E_OK)
175 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000176 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000177 char *end = strchr(tok->inp, '\n');
178 if (end != NULL)
179 end++;
180 else {
181 end = strchr(tok->inp, '\0');
182 if (end == tok->inp) {
183 tok->done = E_EOF;
184 return EOF;
185 }
186 }
187 if (tok->start == NULL)
188 tok->buf = tok->cur;
189 tok->lineno++;
190 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000191 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000192 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000194 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000195 if (tok->nextprompt != NULL)
196 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000197 if (new == NULL)
198 tok->done = E_INTR;
199 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000201 tok->done = E_EOF;
202 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000203 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000204 size_t start = tok->start - tok->buf;
205 size_t oldlen = tok->cur - tok->buf;
206 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000207 char *buf = tok->buf;
208 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000209 tok->lineno++;
210 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000211 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000212 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000213 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000214 tok->done = E_NOMEM;
215 return EOF;
216 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000217 tok->buf = buf;
218 tok->cur = tok->buf + oldlen;
219 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000221 tok->inp = tok->buf + newlen;
222 tok->end = tok->inp + 1;
223 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000224 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000225 else {
226 tok->lineno++;
227 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000228 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000229 tok->buf = new;
230 tok->cur = tok->buf;
231 tok->inp = strchr(tok->buf, '\0');
232 tok->end = tok->inp + 1;
233 }
234 }
235 else {
236 int done = 0;
237 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000238 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000239 if (tok->start == NULL) {
240 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000241 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000242 if (tok->buf == NULL) {
243 tok->done = E_NOMEM;
244 return EOF;
245 }
246 tok->end = tok->buf + BUFSIZ;
247 }
248 if (fgets(tok->buf, (int)(tok->end - tok->buf),
249 tok->fp) == NULL) {
250 tok->done = E_EOF;
251 done = 1;
252 }
253 else {
254 tok->done = E_OK;
255 tok->inp = strchr(tok->buf, '\0');
256 done = tok->inp[-1] == '\n';
257 }
258 }
259 else {
260 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000261 if (feof(tok->fp)) {
262 tok->done = E_EOF;
263 done = 1;
264 }
265 else
266 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000267 }
268 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000269 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000270 while (!done) {
271 int curstart = tok->start == NULL ? -1 :
272 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000273 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000274 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000275 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000276 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000277 if (newbuf == NULL) {
278 tok->done = E_NOMEM;
279 tok->cur = tok->inp;
280 return EOF;
281 }
282 tok->buf = newbuf;
283 tok->inp = tok->buf + curvalid;
284 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000285 tok->start = curstart < 0 ? NULL :
286 tok->buf + curstart;
287 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000288 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000289 tok->fp) == NULL) {
290 /* Last line does not end in \n,
291 fake one */
292 strcpy(tok->inp, "\n");
293 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000294 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000295 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000296 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000297 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000298#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000299 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000300 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000301 pt = tok->inp - 2;
302 if (pt >= tok->buf && *pt == '\r') {
303 *pt++ = '\n';
304 *pt = '\0';
305 tok->inp = pt;
306 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000307#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000308 }
309 if (tok->done != E_OK) {
310 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000311 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000312 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000313 return EOF;
314 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000315 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000316 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000317}
318
319
320/* Back-up one character */
321
322static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000323tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000324{
325 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000326 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000327 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000328 if (*tok->cur != c)
329 *tok->cur = c;
330 }
331}
332
333
334/* Return the token corresponding to a single character */
335
336int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000337PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000338{
339 switch (c) {
340 case '(': return LPAR;
341 case ')': return RPAR;
342 case '[': return LSQB;
343 case ']': return RSQB;
344 case ':': return COLON;
345 case ',': return COMMA;
346 case ';': return SEMI;
347 case '+': return PLUS;
348 case '-': return MINUS;
349 case '*': return STAR;
350 case '/': return SLASH;
351 case '|': return VBAR;
352 case '&': return AMPER;
353 case '<': return LESS;
354 case '>': return GREATER;
355 case '=': return EQUAL;
356 case '.': return DOT;
357 case '%': return PERCENT;
358 case '`': return BACKQUOTE;
359 case '{': return LBRACE;
360 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000361 case '^': return CIRCUMFLEX;
362 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000363 default: return OP;
364 }
365}
366
367
Guido van Rossumfbab9051991-10-20 20:25:03 +0000368int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000369PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000370{
371 switch (c1) {
372 case '=':
373 switch (c2) {
374 case '=': return EQEQUAL;
375 }
376 break;
377 case '!':
378 switch (c2) {
379 case '=': return NOTEQUAL;
380 }
381 break;
382 case '<':
383 switch (c2) {
384 case '>': return NOTEQUAL;
385 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000386 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000387 }
388 break;
389 case '>':
390 switch (c2) {
391 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000392 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000393 }
394 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000395 case '+':
396 switch (c2) {
397 case '=': return PLUSEQUAL;
398 }
399 break;
400 case '-':
401 switch (c2) {
402 case '=': return MINEQUAL;
403 }
404 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000405 case '*':
406 switch (c2) {
407 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000408 case '=': return STAREQUAL;
409 }
410 break;
411 case '/':
412 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000413 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000414 case '=': return SLASHEQUAL;
415 }
416 break;
417 case '|':
418 switch (c2) {
419 case '=': return VBAREQUAL;
420 }
421 break;
422 case '%':
423 switch (c2) {
424 case '=': return PERCENTEQUAL;
425 }
426 break;
427 case '&':
428 switch (c2) {
429 case '=': return AMPEREQUAL;
430 }
431 break;
432 case '^':
433 switch (c2) {
434 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000435 }
436 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000437 }
438 return OP;
439}
440
Thomas Wouters434d0822000-08-24 20:11:32 +0000441int
442PyToken_ThreeChars(int c1, int c2, int c3)
443{
444 switch (c1) {
445 case '<':
446 switch (c2) {
447 case '<':
448 switch (c3) {
449 case '=':
450 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000451 }
452 break;
453 }
454 break;
455 case '>':
456 switch (c2) {
457 case '>':
458 switch (c3) {
459 case '=':
460 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000461 }
462 break;
463 }
464 break;
465 case '*':
466 switch (c2) {
467 case '*':
468 switch (c3) {
469 case '=':
470 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000471 }
472 break;
473 }
474 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000475 case '/':
476 switch (c2) {
477 case '/':
478 switch (c3) {
479 case '=':
480 return DOUBLESLASHEQUAL;
481 }
482 break;
483 }
484 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000485 }
486 return OP;
487}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000488
Guido van Rossum926f13a1998-04-09 21:38:06 +0000489static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000490indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000491{
492 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000493 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000494 tok->cur = tok->inp;
495 return 1;
496 }
497 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000498 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
499 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000500 tok->altwarning = 0;
501 }
502 return 0;
503}
504
505
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000506/* Get next token, after space stripping etc. */
507
508int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000509PyTokenizer_Get(register struct tok_state *tok, char **p_start,
510 char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000511{
512 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000513 int blankline;
514
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000515 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000516 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000517 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000518 blankline = 0;
519
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000520 /* Get indentation level */
521 if (tok->atbol) {
522 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000523 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000524 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000525 for (;;) {
526 c = tok_nextc(tok);
527 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000528 col++, altcol++;
529 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000530 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000531 altcol = (altcol/tok->alttabsize + 1)
532 * tok->alttabsize;
533 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000534 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000535 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000536 else
537 break;
538 }
539 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000540 if (c == '#' || c == '\n') {
541 /* Lines with only whitespace and/or comments
542 shouldn't affect the indentation and are
543 not passed to the parser as NEWLINE tokens,
544 except *totally* empty lines in interactive
545 mode, which signal the end of a command group. */
546 if (col == 0 && c == '\n' && tok->prompt != NULL)
547 blankline = 0; /* Let it through */
548 else
549 blankline = 1; /* Ignore completely */
550 /* We can't jump back right here since we still
551 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000552 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000553 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000554 if (col == tok->indstack[tok->indent]) {
555 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000556 if (altcol != tok->altindstack[tok->indent]) {
557 if (indenterror(tok))
558 return ERRORTOKEN;
559 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000560 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000561 else if (col > tok->indstack[tok->indent]) {
562 /* Indent -- always one */
563 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000564 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000565 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000566 return ERRORTOKEN;
567 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000568 if (altcol <= tok->altindstack[tok->indent]) {
569 if (indenterror(tok))
570 return ERRORTOKEN;
571 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000572 tok->pendin++;
573 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000574 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000575 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000576 else /* col < tok->indstack[tok->indent] */ {
577 /* Dedent -- any number, must be consistent */
578 while (tok->indent > 0 &&
579 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000580 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000581 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000582 }
583 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000584 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000585 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000586 return ERRORTOKEN;
587 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000588 if (altcol != tok->altindstack[tok->indent]) {
589 if (indenterror(tok))
590 return ERRORTOKEN;
591 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000592 }
593 }
594 }
595
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000596 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000597
598 /* Return pending indents/dedents */
599 if (tok->pendin != 0) {
600 if (tok->pendin < 0) {
601 tok->pendin++;
602 return DEDENT;
603 }
604 else {
605 tok->pendin--;
606 return INDENT;
607 }
608 }
609
610 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000611 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000612 /* Skip spaces */
613 do {
614 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000615 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000616
617 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000618 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619
Guido van Rossumab5ca152000-03-31 00:52:27 +0000620 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000621 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000622 static char *tabforms[] = {
623 "tab-width:", /* Emacs */
624 ":tabstop=", /* vim, full form */
625 ":ts=", /* vim, abbreviated form */
626 "set tabsize=", /* will vi never die? */
627 /* more templates can be added here to support other editors */
628 };
629 char cbuf[80];
630 char *tp, **cp;
631 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000632 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000633 *tp++ = c = tok_nextc(tok);
634 } while (c != EOF && c != '\n' &&
635 tp - cbuf + 1 < sizeof(cbuf));
636 *tp = '\0';
637 for (cp = tabforms;
638 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
639 cp++) {
640 if ((tp = strstr(cbuf, *cp))) {
641 int newsize = atoi(tp + strlen(*cp));
642
643 if (newsize >= 1 && newsize <= 40) {
644 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000645 if (Py_VerboseFlag)
646 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000647 "Tab size set to %d\n",
648 newsize);
649 }
650 }
651 }
652 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000654 }
655
656 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000657 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000659 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660
661 /* Identifier (most frequent token!) */
662 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000663 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000664 switch (c) {
665 case 'r':
666 case 'R':
667 c = tok_nextc(tok);
668 if (c == '"' || c == '\'')
669 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000670 break;
671 case 'u':
672 case 'U':
673 c = tok_nextc(tok);
674 if (c == 'r' || c == 'R')
675 c = tok_nextc(tok);
676 if (c == '"' || c == '\'')
677 goto letter_quote;
678 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000679 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000680 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000682 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000684 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000685 *p_end = tok->cur;
686 return NAME;
687 }
688
689 /* Newline */
690 if (c == '\n') {
691 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000692 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000693 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000694 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000695 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
696 return NEWLINE;
697 }
698
Guido van Rossum2d45be11997-04-11 19:16:25 +0000699#ifdef macintosh
700 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000701 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000702 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000703 tok->done = E_TOKEN;
704 tok->cur = tok->inp;
705 return ERRORTOKEN;
706 }
707#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000708 /* Period or number starting with period? */
709 if (c == '.') {
710 c = tok_nextc(tok);
711 if (isdigit(c)) {
712 goto fraction;
713 }
714 else {
715 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000716 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000717 *p_end = tok->cur;
718 return DOT;
719 }
720 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000721
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722 /* Number */
723 if (isdigit(c)) {
724 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +0000725 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726 c = tok_nextc(tok);
727 if (c == '.')
728 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000729#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000730 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000731 goto imaginary;
732#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000733 if (c == 'x' || c == 'X') {
734 /* Hex */
735 do {
736 c = tok_nextc(tok);
737 } while (isxdigit(c));
738 }
739 else {
Tim Petersd507dab2001-08-30 20:51:59 +0000740 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741 /* Octal; c is first char of it */
742 /* There's no 'isoctdigit' macro, sigh */
743 while ('0' <= c && c < '8') {
744 c = tok_nextc(tok);
745 }
Tim Petersd507dab2001-08-30 20:51:59 +0000746 if (isdigit(c)) {
747 found_decimal = 1;
748 do {
749 c = tok_nextc(tok);
750 } while (isdigit(c));
751 }
752 if (c == '.')
753 goto fraction;
754 else if (c == 'e' || c == 'E')
755 goto exponent;
756#ifndef WITHOUT_COMPLEX
757 else if (c == 'j' || c == 'J')
758 goto imaginary;
759#endif
760 else if (found_decimal) {
761 tok->done = E_TOKEN;
762 tok_backup(tok, c);
763 return ERRORTOKEN;
764 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000766 if (c == 'l' || c == 'L')
767 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768 }
769 else {
770 /* Decimal */
771 do {
772 c = tok_nextc(tok);
773 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000774 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000776 else {
Tim Peters9aa70d92001-08-27 19:19:28 +0000777 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +0000778 if (c == '.') {
779 fraction:
780 /* Fraction */
781 do {
782 c = tok_nextc(tok);
783 } while (isdigit(c));
784 }
785 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +0000786 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +0000787 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000789 if (c == '+' || c == '-')
790 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +0000791 if (!isdigit(c)) {
792 tok->done = E_TOKEN;
793 tok_backup(tok, c);
794 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +0000795 }
Tim Peters9aa70d92001-08-27 19:19:28 +0000796 do {
797 c = tok_nextc(tok);
798 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000800#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000801 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000802 /* Imaginary part */
803 imaginary:
804 c = tok_nextc(tok);
805#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806 }
807 }
808 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000809 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810 *p_end = tok->cur;
811 return NUMBER;
812 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000813
814 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000815 /* String */
816 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000817 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 int quote = c;
819 int triple = 0;
820 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821 for (;;) {
822 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 if (c == '\n') {
824 if (!triple) {
825 tok->done = E_TOKEN;
826 tok_backup(tok, c);
827 return ERRORTOKEN;
828 }
829 tripcount = 0;
830 }
831 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000832 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000833 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000834 return ERRORTOKEN;
835 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 else if (c == quote) {
837 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000838 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 c = tok_nextc(tok);
840 if (c == quote) {
841 triple = 1;
842 tripcount = 0;
843 continue;
844 }
845 tok_backup(tok, c);
846 }
847 if (!triple || tripcount == 3)
848 break;
849 }
850 else if (c == '\\') {
851 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000853 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000855 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856 return ERRORTOKEN;
857 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000859 else
860 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000863 *p_end = tok->cur;
864 return STRING;
865 }
866
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000867 /* Line continuation */
868 if (c == '\\') {
869 c = tok_nextc(tok);
870 if (c != '\n') {
871 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000872 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873 return ERRORTOKEN;
874 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875 goto again; /* Read next line */
876 }
877
Guido van Rossumfbab9051991-10-20 20:25:03 +0000878 /* Check for two-character token */
879 {
880 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000881 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000882 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +0000883 int c3 = tok_nextc(tok);
884 int token3 = PyToken_ThreeChars(c, c2, c3);
885 if (token3 != OP) {
886 token = token3;
887 } else {
888 tok_backup(tok, c3);
889 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000891 *p_end = tok->cur;
892 return token;
893 }
894 tok_backup(tok, c2);
895 }
896
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000897 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000898 switch (c) {
899 case '(':
900 case '[':
901 case '{':
902 tok->level++;
903 break;
904 case ')':
905 case ']':
906 case '}':
907 tok->level--;
908 break;
909 }
910
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000911 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000912 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000914 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000915}
916
917
Guido van Rossum408027e1996-12-30 16:17:54 +0000918#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919
920void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000921tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000922{
Guido van Rossum86bea461997-04-29 21:03:06 +0000923 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000924 if (type == NAME || type == NUMBER || type == STRING || type == OP)
925 printf("(%.*s)", (int)(end - start), start);
926}
927
928#endif