
/* Tokenizer implementation */
3
Guido van Rossum3f5da241990-12-20 15:06:42 +00004#include "pgenheaders.h"
5
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00006#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00008#include "tokenizer.h"
9#include "errcode.h"
10
/* Read one line of input after showing the given prompt.
   Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(char *prompt);

15
/* Default tab stop used when computing indentation columns.
   Don't ever change this -- it would break the portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000018
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative int.
   XXX This assumes characters are 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c) (c)
#else
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif

26
/* Forward declarations of the file-local helpers defined below */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

Guido van Rossum3f5da241990-12-20 15:06:42 +000031
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Generic / bookkeeping entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
88
89
90/* Create and initialize a new tok_state structure */
91
92static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000093tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000094{
Guido van Rossum86bea461997-04-29 21:03:06 +000095 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000096 if (tok == NULL)
97 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +000098 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099 tok->done = E_OK;
100 tok->fp = NULL;
101 tok->tabsize = TABSIZE;
102 tok->indent = 0;
103 tok->indstack[0] = 0;
104 tok->atbol = 1;
105 tok->pendin = 0;
106 tok->prompt = tok->nextprompt = NULL;
107 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000108 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000109 tok->filename = NULL;
110 tok->altwarning = 0;
111 tok->alterror = 0;
112 tok->alttabsize = 1;
113 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 return tok;
115}
116
117
118/* Set up tokenizer for string */
119
120struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
123 struct tok_state *tok = tok_new();
124 if (tok == NULL)
125 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000126 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000127 return tok;
128}
129
130
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000131/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000132
133struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000134PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135{
136 struct tok_state *tok = tok_new();
137 if (tok == NULL)
138 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000139 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
140 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000141 return NULL;
142 }
143 tok->cur = tok->inp = tok->buf;
144 tok->end = tok->buf + BUFSIZ;
145 tok->fp = fp;
146 tok->prompt = ps1;
147 tok->nextprompt = ps2;
148 return tok;
149}
150
151
152/* Free a tok_state structure */
153
154void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000155PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000157 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000158 PyMem_DEL(tok->buf);
159 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000160}
161
162
163/* Get next char, updating state; error code goes into tok->done */
164
165static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000166tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000167{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000168 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000169 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000170 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000171 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000172 if (tok->done != E_OK)
173 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000174 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000175 char *end = strchr(tok->inp, '\n');
176 if (end != NULL)
177 end++;
178 else {
179 end = strchr(tok->inp, '\0');
180 if (end == tok->inp) {
181 tok->done = E_EOF;
182 return EOF;
183 }
184 }
185 if (tok->start == NULL)
186 tok->buf = tok->cur;
187 tok->lineno++;
188 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000189 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000190 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000191 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000192 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 if (tok->nextprompt != NULL)
194 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000195 if (new == NULL)
196 tok->done = E_INTR;
197 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000198 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000199 tok->done = E_EOF;
200 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000201 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000202 size_t start = tok->start - tok->buf;
203 size_t oldlen = tok->cur - tok->buf;
204 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000205 char *buf = tok->buf;
206 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000207 tok->lineno++;
208 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000209 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000210 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000211 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000212 tok->done = E_NOMEM;
213 return EOF;
214 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000215 tok->buf = buf;
216 tok->cur = tok->buf + oldlen;
217 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000218 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000219 tok->inp = tok->buf + newlen;
220 tok->end = tok->inp + 1;
221 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000222 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000223 else {
224 tok->lineno++;
225 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000226 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000227 tok->buf = new;
228 tok->cur = tok->buf;
229 tok->inp = strchr(tok->buf, '\0');
230 tok->end = tok->inp + 1;
231 }
232 }
233 else {
234 int done = 0;
235 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000236 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000237 if (tok->start == NULL) {
238 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000239 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000240 if (tok->buf == NULL) {
241 tok->done = E_NOMEM;
242 return EOF;
243 }
244 tok->end = tok->buf + BUFSIZ;
245 }
246 if (fgets(tok->buf, (int)(tok->end - tok->buf),
247 tok->fp) == NULL) {
248 tok->done = E_EOF;
249 done = 1;
250 }
251 else {
252 tok->done = E_OK;
253 tok->inp = strchr(tok->buf, '\0');
254 done = tok->inp[-1] == '\n';
255 }
256 }
257 else {
258 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000259 if (feof(tok->fp)) {
260 tok->done = E_EOF;
261 done = 1;
262 }
263 else
264 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000265 }
266 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000267 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000268 while (!done) {
269 int curstart = tok->start == NULL ? -1 :
270 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000271 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000272 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000273 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000274 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000275 if (newbuf == NULL) {
276 tok->done = E_NOMEM;
277 tok->cur = tok->inp;
278 return EOF;
279 }
280 tok->buf = newbuf;
281 tok->inp = tok->buf + curvalid;
282 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000283 tok->start = curstart < 0 ? NULL :
284 tok->buf + curstart;
285 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000286 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000287 tok->fp) == NULL) {
288 /* Last line does not end in \n,
289 fake one */
290 strcpy(tok->inp, "\n");
291 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000292 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000293 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000294 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000295 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000296#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000297 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000298 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000299 pt = tok->inp - 2;
300 if (pt >= tok->buf && *pt == '\r') {
301 *pt++ = '\n';
302 *pt = '\0';
303 tok->inp = pt;
304 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000305#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000306 }
307 if (tok->done != E_OK) {
308 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000309 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000310 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000311 return EOF;
312 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000313 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000314 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000315}
316
317
318/* Back-up one character */
319
320static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000321tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000322{
323 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000324 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000325 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000326 if (*tok->cur != c)
327 *tok->cur = c;
328 }
329}
330
331
332/* Return the token corresponding to a single character */
333
334int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000335PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000336{
337 switch (c) {
338 case '(': return LPAR;
339 case ')': return RPAR;
340 case '[': return LSQB;
341 case ']': return RSQB;
342 case ':': return COLON;
343 case ',': return COMMA;
344 case ';': return SEMI;
345 case '+': return PLUS;
346 case '-': return MINUS;
347 case '*': return STAR;
348 case '/': return SLASH;
349 case '|': return VBAR;
350 case '&': return AMPER;
351 case '<': return LESS;
352 case '>': return GREATER;
353 case '=': return EQUAL;
354 case '.': return DOT;
355 case '%': return PERCENT;
356 case '`': return BACKQUOTE;
357 case '{': return LBRACE;
358 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000359 case '^': return CIRCUMFLEX;
360 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000361 default: return OP;
362 }
363}
364
365
Guido van Rossumfbab9051991-10-20 20:25:03 +0000366int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000367PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000368{
369 switch (c1) {
370 case '=':
371 switch (c2) {
372 case '=': return EQEQUAL;
373 }
374 break;
375 case '!':
376 switch (c2) {
377 case '=': return NOTEQUAL;
378 }
379 break;
380 case '<':
381 switch (c2) {
382 case '>': return NOTEQUAL;
383 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000384 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000385 }
386 break;
387 case '>':
388 switch (c2) {
389 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000390 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000391 }
392 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000393 case '+':
394 switch (c2) {
395 case '=': return PLUSEQUAL;
396 }
397 break;
398 case '-':
399 switch (c2) {
400 case '=': return MINEQUAL;
401 }
402 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000403 case '*':
404 switch (c2) {
405 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000406 case '=': return STAREQUAL;
407 }
408 break;
409 case '/':
410 switch (c2) {
411 case '=': return SLASHEQUAL;
412 }
413 break;
414 case '|':
415 switch (c2) {
416 case '=': return VBAREQUAL;
417 }
418 break;
419 case '%':
420 switch (c2) {
421 case '=': return PERCENTEQUAL;
422 }
423 break;
424 case '&':
425 switch (c2) {
426 case '=': return AMPEREQUAL;
427 }
428 break;
429 case '^':
430 switch (c2) {
431 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000432 }
433 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000434 }
435 return OP;
436}
437
Thomas Wouters434d0822000-08-24 20:11:32 +0000438int
439PyToken_ThreeChars(int c1, int c2, int c3)
440{
441 switch (c1) {
442 case '<':
443 switch (c2) {
444 case '<':
445 switch (c3) {
446 case '=':
447 return LEFTSHIFTEQUAL;
448 break;
449 }
450 break;
451 }
452 break;
453 case '>':
454 switch (c2) {
455 case '>':
456 switch (c3) {
457 case '=':
458 return RIGHTSHIFTEQUAL;
459 break;
460 }
461 break;
462 }
463 break;
464 case '*':
465 switch (c2) {
466 case '*':
467 switch (c3) {
468 case '=':
469 return DOUBLESTAREQUAL;
470 break;
471 }
472 break;
473 }
474 break;
475 }
476 return OP;
477}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000478
Guido van Rossum926f13a1998-04-09 21:38:06 +0000479static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000480indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000481{
482 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000483 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000484 tok->cur = tok->inp;
485 return 1;
486 }
487 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000488 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
489 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000490 tok->altwarning = 0;
491 }
492 return 0;
493}
494
495
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000496/* Get next token, after space stripping etc. */
497
498int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000499PyTokenizer_Get(register struct tok_state *tok, char **p_start,
500 char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000501{
502 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000503 int blankline;
504
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000505 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000506 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000507 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000508 blankline = 0;
509
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000510 /* Get indentation level */
511 if (tok->atbol) {
512 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000513 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000514 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000515 for (;;) {
516 c = tok_nextc(tok);
517 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000518 col++, altcol++;
519 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000520 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000521 altcol = (altcol/tok->alttabsize + 1)
522 * tok->alttabsize;
523 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000524 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000525 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000526 else
527 break;
528 }
529 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000530 if (c == '#' || c == '\n') {
531 /* Lines with only whitespace and/or comments
532 shouldn't affect the indentation and are
533 not passed to the parser as NEWLINE tokens,
534 except *totally* empty lines in interactive
535 mode, which signal the end of a command group. */
536 if (col == 0 && c == '\n' && tok->prompt != NULL)
537 blankline = 0; /* Let it through */
538 else
539 blankline = 1; /* Ignore completely */
540 /* We can't jump back right here since we still
541 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000542 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000543 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000544 if (col == tok->indstack[tok->indent]) {
545 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000546 if (altcol != tok->altindstack[tok->indent]) {
547 if (indenterror(tok))
548 return ERRORTOKEN;
549 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000550 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000551 else if (col > tok->indstack[tok->indent]) {
552 /* Indent -- always one */
553 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000554 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000555 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000556 return ERRORTOKEN;
557 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000558 if (altcol <= tok->altindstack[tok->indent]) {
559 if (indenterror(tok))
560 return ERRORTOKEN;
561 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000562 tok->pendin++;
563 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000564 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000565 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000566 else /* col < tok->indstack[tok->indent] */ {
567 /* Dedent -- any number, must be consistent */
568 while (tok->indent > 0 &&
569 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000570 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000571 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000572 }
573 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000574 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000575 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000576 return ERRORTOKEN;
577 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000578 if (altcol != tok->altindstack[tok->indent]) {
579 if (indenterror(tok))
580 return ERRORTOKEN;
581 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000582 }
583 }
584 }
585
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000586 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000587
588 /* Return pending indents/dedents */
589 if (tok->pendin != 0) {
590 if (tok->pendin < 0) {
591 tok->pendin++;
592 return DEDENT;
593 }
594 else {
595 tok->pendin--;
596 return INDENT;
597 }
598 }
599
600 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000601 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000602 /* Skip spaces */
603 do {
604 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000605 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606
607 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000608 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000609
Guido van Rossumab5ca152000-03-31 00:52:27 +0000610 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000611 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000612 static char *tabforms[] = {
613 "tab-width:", /* Emacs */
614 ":tabstop=", /* vim, full form */
615 ":ts=", /* vim, abbreviated form */
616 "set tabsize=", /* will vi never die? */
617 /* more templates can be added here to support other editors */
618 };
619 char cbuf[80];
620 char *tp, **cp;
621 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000623 *tp++ = c = tok_nextc(tok);
624 } while (c != EOF && c != '\n' &&
625 tp - cbuf + 1 < sizeof(cbuf));
626 *tp = '\0';
627 for (cp = tabforms;
628 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
629 cp++) {
630 if ((tp = strstr(cbuf, *cp))) {
631 int newsize = atoi(tp + strlen(*cp));
632
633 if (newsize >= 1 && newsize <= 40) {
634 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000635 if (Py_VerboseFlag)
636 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000637 "Tab size set to %d\n",
638 newsize);
639 }
640 }
641 }
642 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000643 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000644 }
645
646 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000647 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000649 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650
651 /* Identifier (most frequent token!) */
652 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000653 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000654 switch (c) {
655 case 'r':
656 case 'R':
657 c = tok_nextc(tok);
658 if (c == '"' || c == '\'')
659 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000660 break;
661 case 'u':
662 case 'U':
663 c = tok_nextc(tok);
664 if (c == 'r' || c == 'R')
665 c = tok_nextc(tok);
666 if (c == '"' || c == '\'')
667 goto letter_quote;
668 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000669 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000670 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000671 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000672 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000674 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675 *p_end = tok->cur;
676 return NAME;
677 }
678
679 /* Newline */
680 if (c == '\n') {
681 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000682 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000683 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000684 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000685 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
686 return NEWLINE;
687 }
688
Guido van Rossum2d45be11997-04-11 19:16:25 +0000689#ifdef macintosh
690 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000691 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000692 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000693 tok->done = E_TOKEN;
694 tok->cur = tok->inp;
695 return ERRORTOKEN;
696 }
697#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000698 /* Period or number starting with period? */
699 if (c == '.') {
700 c = tok_nextc(tok);
701 if (isdigit(c)) {
702 goto fraction;
703 }
704 else {
705 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000706 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000707 *p_end = tok->cur;
708 return DOT;
709 }
710 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000711
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712 /* Number */
713 if (isdigit(c)) {
714 if (c == '0') {
715 /* Hex or octal */
716 c = tok_nextc(tok);
717 if (c == '.')
718 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000719#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000720 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000721 goto imaginary;
722#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723 if (c == 'x' || c == 'X') {
724 /* Hex */
725 do {
726 c = tok_nextc(tok);
727 } while (isxdigit(c));
728 }
729 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000730 /* XXX This is broken! E.g.,
731 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732 /* Octal; c is first char of it */
733 /* There's no 'isoctdigit' macro, sigh */
734 while ('0' <= c && c < '8') {
735 c = tok_nextc(tok);
736 }
737 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000738 if (c == 'l' || c == 'L')
739 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740 }
741 else {
742 /* Decimal */
743 do {
744 c = tok_nextc(tok);
745 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000746 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000748 else {
749 /* Accept floating point numbers.
750 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000751 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000752 if (c == '.') {
753 fraction:
754 /* Fraction */
755 do {
756 c = tok_nextc(tok);
757 } while (isdigit(c));
758 }
759 if (c == 'e' || c == 'E') {
760 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000762 if (c == '+' || c == '-')
763 c = tok_nextc(tok);
764 while (isdigit(c)) {
765 c = tok_nextc(tok);
766 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000768#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000769 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000770 /* Imaginary part */
771 imaginary:
772 c = tok_nextc(tok);
773#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 }
775 }
776 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000777 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 *p_end = tok->cur;
779 return NUMBER;
780 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000781
782 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 /* String */
784 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000785 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000786 int quote = c;
787 int triple = 0;
788 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789 for (;;) {
790 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 if (c == '\n') {
792 if (!triple) {
793 tok->done = E_TOKEN;
794 tok_backup(tok, c);
795 return ERRORTOKEN;
796 }
797 tripcount = 0;
798 }
799 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000801 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802 return ERRORTOKEN;
803 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000804 else if (c == quote) {
805 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000806 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 c = tok_nextc(tok);
808 if (c == quote) {
809 triple = 1;
810 tripcount = 0;
811 continue;
812 }
813 tok_backup(tok, c);
814 }
815 if (!triple || tripcount == 3)
816 break;
817 }
818 else if (c == '\\') {
819 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000820 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000823 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824 return ERRORTOKEN;
825 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 else
828 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000831 *p_end = tok->cur;
832 return STRING;
833 }
834
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835 /* Line continuation */
836 if (c == '\\') {
837 c = tok_nextc(tok);
838 if (c != '\n') {
839 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000840 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000841 return ERRORTOKEN;
842 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843 goto again; /* Read next line */
844 }
845
Guido van Rossumfbab9051991-10-20 20:25:03 +0000846 /* Check for two-character token */
847 {
848 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000849 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000850 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +0000851 int c3 = tok_nextc(tok);
852 int token3 = PyToken_ThreeChars(c, c2, c3);
853 if (token3 != OP) {
854 token = token3;
855 } else {
856 tok_backup(tok, c3);
857 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000858 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000859 *p_end = tok->cur;
860 return token;
861 }
862 tok_backup(tok, c2);
863 }
864
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000866 switch (c) {
867 case '(':
868 case '[':
869 case '{':
870 tok->level++;
871 break;
872 case ')':
873 case ']':
874 case '}':
875 tok->level--;
876 break;
877 }
878
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000879 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000882 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883}
884
885
#ifdef Py_DEBUG

/* Debugging aid: print the name of the token type and, for NAME, NUMBER,
   STRING and OP tokens, the token text between start and end in
   parentheses. */

void
tok_dump(int type, char *start, char *end)
{
    int has_text;

    printf("%s", _PyParser_TokenNames[type]);

    has_text = (type == NAME) || (type == NUMBER) ||
               (type == STRING) || (type == OP);
    if (has_text) {
        printf("(%.*s)", (int)(end - start), start);
    }
}

#endif