
/* Tokenizer implementation */

#include "pgenheaders.h"

#include <ctype.h>

#include "tokenizer.h"
#include "errcode.h"

/* Line reader defined outside this file; called with the current prompt.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(char *);

/* Default tab stop used when computing indentation columns.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)      (c)
#else
#define Py_CHARMASK(c)      ((c) & 0xff)
#endif

/* Forward declarations of the file-local helpers defined further down */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};


90/* Create and initialize a new tok_state structure */
91
92static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000093tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000094{
Guido van Rossum86bea461997-04-29 21:03:06 +000095 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000096 if (tok == NULL)
97 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +000098 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099 tok->done = E_OK;
100 tok->fp = NULL;
101 tok->tabsize = TABSIZE;
102 tok->indent = 0;
103 tok->indstack[0] = 0;
104 tok->atbol = 1;
105 tok->pendin = 0;
106 tok->prompt = tok->nextprompt = NULL;
107 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000108 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000109 tok->filename = NULL;
110 tok->altwarning = 0;
111 tok->alterror = 0;
112 tok->alttabsize = 1;
113 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 return tok;
115}
116
117
118/* Set up tokenizer for string */
119
120struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
123 struct tok_state *tok = tok_new();
124 if (tok == NULL)
125 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000126 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000127 return tok;
128}
129
130
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000131/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000132
133struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000134PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135{
136 struct tok_state *tok = tok_new();
137 if (tok == NULL)
138 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000139 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
140 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000141 return NULL;
142 }
143 tok->cur = tok->inp = tok->buf;
144 tok->end = tok->buf + BUFSIZ;
145 tok->fp = fp;
146 tok->prompt = ps1;
147 tok->nextprompt = ps2;
148 return tok;
149}
150
151
152/* Free a tok_state structure */
153
154void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000155PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000157 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000158 PyMem_DEL(tok->buf);
159 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000160}
161
162
163/* Get next char, updating state; error code goes into tok->done */
164
165static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000166tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000167{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000168 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000169 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000170 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000171 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000172 if (tok->done != E_OK)
173 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000174 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000175 char *end = strchr(tok->inp, '\n');
176 if (end != NULL)
177 end++;
178 else {
179 end = strchr(tok->inp, '\0');
180 if (end == tok->inp) {
181 tok->done = E_EOF;
182 return EOF;
183 }
184 }
185 if (tok->start == NULL)
186 tok->buf = tok->cur;
187 tok->lineno++;
188 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000189 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000190 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000191 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000192 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 if (tok->nextprompt != NULL)
194 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000195 if (new == NULL)
196 tok->done = E_INTR;
197 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000198 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000199 tok->done = E_EOF;
200 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000201 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000202 size_t start = tok->start - tok->buf;
203 size_t oldlen = tok->cur - tok->buf;
204 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000205 char *buf = tok->buf;
206 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000207 tok->lineno++;
208 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000209 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000210 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000211 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000212 tok->done = E_NOMEM;
213 return EOF;
214 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000215 tok->buf = buf;
216 tok->cur = tok->buf + oldlen;
217 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000218 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000219 tok->inp = tok->buf + newlen;
220 tok->end = tok->inp + 1;
221 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000222 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000223 else {
224 tok->lineno++;
225 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000226 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000227 tok->buf = new;
228 tok->cur = tok->buf;
229 tok->inp = strchr(tok->buf, '\0');
230 tok->end = tok->inp + 1;
231 }
232 }
233 else {
234 int done = 0;
235 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000236 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000237 if (tok->start == NULL) {
238 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000239 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000240 if (tok->buf == NULL) {
241 tok->done = E_NOMEM;
242 return EOF;
243 }
244 tok->end = tok->buf + BUFSIZ;
245 }
246 if (fgets(tok->buf, (int)(tok->end - tok->buf),
247 tok->fp) == NULL) {
248 tok->done = E_EOF;
249 done = 1;
250 }
251 else {
252 tok->done = E_OK;
253 tok->inp = strchr(tok->buf, '\0');
254 done = tok->inp[-1] == '\n';
255 }
256 }
257 else {
258 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000259 if (feof(tok->fp)) {
260 tok->done = E_EOF;
261 done = 1;
262 }
263 else
264 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000265 }
266 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000267 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000268 while (!done) {
269 int curstart = tok->start == NULL ? -1 :
270 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000271 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000272 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000273 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000274 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000275 if (newbuf == NULL) {
276 tok->done = E_NOMEM;
277 tok->cur = tok->inp;
278 return EOF;
279 }
280 tok->buf = newbuf;
281 tok->inp = tok->buf + curvalid;
282 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000283 tok->start = curstart < 0 ? NULL :
284 tok->buf + curstart;
285 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000286 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000287 tok->fp) == NULL) {
288 /* Last line does not end in \n,
289 fake one */
290 strcpy(tok->inp, "\n");
291 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000292 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000293 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000294 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000295 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000296#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000297 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000298 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000299 pt = tok->inp - 2;
300 if (pt >= tok->buf && *pt == '\r') {
301 *pt++ = '\n';
302 *pt = '\0';
303 tok->inp = pt;
304 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000305#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000306 }
307 if (tok->done != E_OK) {
308 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000309 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000310 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000311 return EOF;
312 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000313 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000314 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000315}
316
317
318/* Back-up one character */
319
320static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000321tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000322{
323 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000324 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000325 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000326 if (*tok->cur != c)
327 *tok->cur = c;
328 }
329}
330
331
332/* Return the token corresponding to a single character */
333
334int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000335PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000336{
337 switch (c) {
338 case '(': return LPAR;
339 case ')': return RPAR;
340 case '[': return LSQB;
341 case ']': return RSQB;
342 case ':': return COLON;
343 case ',': return COMMA;
344 case ';': return SEMI;
345 case '+': return PLUS;
346 case '-': return MINUS;
347 case '*': return STAR;
348 case '/': return SLASH;
349 case '|': return VBAR;
350 case '&': return AMPER;
351 case '<': return LESS;
352 case '>': return GREATER;
353 case '=': return EQUAL;
354 case '.': return DOT;
355 case '%': return PERCENT;
356 case '`': return BACKQUOTE;
357 case '{': return LBRACE;
358 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000359 case '^': return CIRCUMFLEX;
360 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000361 default: return OP;
362 }
363}
364
365
Guido van Rossumfbab9051991-10-20 20:25:03 +0000366int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000367PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000368{
369 switch (c1) {
370 case '=':
371 switch (c2) {
372 case '=': return EQEQUAL;
373 }
374 break;
375 case '!':
376 switch (c2) {
377 case '=': return NOTEQUAL;
378 }
379 break;
380 case '<':
381 switch (c2) {
382 case '>': return NOTEQUAL;
383 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000384 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000385 }
386 break;
387 case '>':
388 switch (c2) {
389 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000390 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000391 }
392 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000393 case '+':
394 switch (c2) {
395 case '=': return PLUSEQUAL;
396 }
397 break;
398 case '-':
399 switch (c2) {
400 case '=': return MINEQUAL;
401 }
402 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000403 case '*':
404 switch (c2) {
405 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000406 case '=': return STAREQUAL;
407 }
408 break;
409 case '/':
410 switch (c2) {
411 case '=': return SLASHEQUAL;
412 }
413 break;
414 case '|':
415 switch (c2) {
416 case '=': return VBAREQUAL;
417 }
418 break;
419 case '%':
420 switch (c2) {
421 case '=': return PERCENTEQUAL;
422 }
423 break;
424 case '&':
425 switch (c2) {
426 case '=': return AMPEREQUAL;
427 }
428 break;
429 case '^':
430 switch (c2) {
431 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000432 }
433 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000434 }
435 return OP;
436}
437
Thomas Wouters434d0822000-08-24 20:11:32 +0000438int
439PyToken_ThreeChars(int c1, int c2, int c3)
440{
441 switch (c1) {
442 case '<':
443 switch (c2) {
444 case '<':
445 switch (c3) {
446 case '=':
447 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000448 }
449 break;
450 }
451 break;
452 case '>':
453 switch (c2) {
454 case '>':
455 switch (c3) {
456 case '=':
457 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000458 }
459 break;
460 }
461 break;
462 case '*':
463 switch (c2) {
464 case '*':
465 switch (c3) {
466 case '=':
467 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000468 }
469 break;
470 }
471 break;
472 }
473 return OP;
474}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000475
Guido van Rossum926f13a1998-04-09 21:38:06 +0000476static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000477indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000478{
479 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000480 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000481 tok->cur = tok->inp;
482 return 1;
483 }
484 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000485 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
486 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000487 tok->altwarning = 0;
488 }
489 return 0;
490}
491
492
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000493/* Get next token, after space stripping etc. */
494
495int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000496PyTokenizer_Get(register struct tok_state *tok, char **p_start,
497 char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000498{
499 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000500 int blankline;
501
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000502 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000503 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000504 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000505 blankline = 0;
506
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000507 /* Get indentation level */
508 if (tok->atbol) {
509 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000510 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000511 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000512 for (;;) {
513 c = tok_nextc(tok);
514 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000515 col++, altcol++;
516 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000517 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000518 altcol = (altcol/tok->alttabsize + 1)
519 * tok->alttabsize;
520 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000521 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000522 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000523 else
524 break;
525 }
526 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000527 if (c == '#' || c == '\n') {
528 /* Lines with only whitespace and/or comments
529 shouldn't affect the indentation and are
530 not passed to the parser as NEWLINE tokens,
531 except *totally* empty lines in interactive
532 mode, which signal the end of a command group. */
533 if (col == 0 && c == '\n' && tok->prompt != NULL)
534 blankline = 0; /* Let it through */
535 else
536 blankline = 1; /* Ignore completely */
537 /* We can't jump back right here since we still
538 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000539 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000540 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000541 if (col == tok->indstack[tok->indent]) {
542 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000543 if (altcol != tok->altindstack[tok->indent]) {
544 if (indenterror(tok))
545 return ERRORTOKEN;
546 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000547 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000548 else if (col > tok->indstack[tok->indent]) {
549 /* Indent -- always one */
550 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000551 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000552 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000553 return ERRORTOKEN;
554 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000555 if (altcol <= tok->altindstack[tok->indent]) {
556 if (indenterror(tok))
557 return ERRORTOKEN;
558 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000559 tok->pendin++;
560 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000561 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000562 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000563 else /* col < tok->indstack[tok->indent] */ {
564 /* Dedent -- any number, must be consistent */
565 while (tok->indent > 0 &&
566 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000567 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000568 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000569 }
570 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000571 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000572 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000573 return ERRORTOKEN;
574 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000575 if (altcol != tok->altindstack[tok->indent]) {
576 if (indenterror(tok))
577 return ERRORTOKEN;
578 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000579 }
580 }
581 }
582
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000583 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000584
585 /* Return pending indents/dedents */
586 if (tok->pendin != 0) {
587 if (tok->pendin < 0) {
588 tok->pendin++;
589 return DEDENT;
590 }
591 else {
592 tok->pendin--;
593 return INDENT;
594 }
595 }
596
597 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000598 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000599 /* Skip spaces */
600 do {
601 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000602 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603
604 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000605 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606
Guido van Rossumab5ca152000-03-31 00:52:27 +0000607 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000609 static char *tabforms[] = {
610 "tab-width:", /* Emacs */
611 ":tabstop=", /* vim, full form */
612 ":ts=", /* vim, abbreviated form */
613 "set tabsize=", /* will vi never die? */
614 /* more templates can be added here to support other editors */
615 };
616 char cbuf[80];
617 char *tp, **cp;
618 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000620 *tp++ = c = tok_nextc(tok);
621 } while (c != EOF && c != '\n' &&
622 tp - cbuf + 1 < sizeof(cbuf));
623 *tp = '\0';
624 for (cp = tabforms;
625 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
626 cp++) {
627 if ((tp = strstr(cbuf, *cp))) {
628 int newsize = atoi(tp + strlen(*cp));
629
630 if (newsize >= 1 && newsize <= 40) {
631 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000632 if (Py_VerboseFlag)
633 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000634 "Tab size set to %d\n",
635 newsize);
636 }
637 }
638 }
639 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000640 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000641 }
642
643 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000644 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000646 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647
648 /* Identifier (most frequent token!) */
649 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000650 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000651 switch (c) {
652 case 'r':
653 case 'R':
654 c = tok_nextc(tok);
655 if (c == '"' || c == '\'')
656 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000657 break;
658 case 'u':
659 case 'U':
660 c = tok_nextc(tok);
661 if (c == 'r' || c == 'R')
662 c = tok_nextc(tok);
663 if (c == '"' || c == '\'')
664 goto letter_quote;
665 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000666 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000667 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000668 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000669 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000670 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000671 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672 *p_end = tok->cur;
673 return NAME;
674 }
675
676 /* Newline */
677 if (c == '\n') {
678 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000679 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000680 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000681 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
683 return NEWLINE;
684 }
685
Guido van Rossum2d45be11997-04-11 19:16:25 +0000686#ifdef macintosh
687 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000688 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000689 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000690 tok->done = E_TOKEN;
691 tok->cur = tok->inp;
692 return ERRORTOKEN;
693 }
694#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000695 /* Period or number starting with period? */
696 if (c == '.') {
697 c = tok_nextc(tok);
698 if (isdigit(c)) {
699 goto fraction;
700 }
701 else {
702 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000703 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000704 *p_end = tok->cur;
705 return DOT;
706 }
707 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000708
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000709 /* Number */
710 if (isdigit(c)) {
711 if (c == '0') {
712 /* Hex or octal */
713 c = tok_nextc(tok);
714 if (c == '.')
715 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000716#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000717 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000718 goto imaginary;
719#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720 if (c == 'x' || c == 'X') {
721 /* Hex */
722 do {
723 c = tok_nextc(tok);
724 } while (isxdigit(c));
725 }
726 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000727 /* XXX This is broken! E.g.,
728 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729 /* Octal; c is first char of it */
730 /* There's no 'isoctdigit' macro, sigh */
731 while ('0' <= c && c < '8') {
732 c = tok_nextc(tok);
733 }
734 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000735 if (c == 'l' || c == 'L')
736 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 }
738 else {
739 /* Decimal */
740 do {
741 c = tok_nextc(tok);
742 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000743 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000745 else {
746 /* Accept floating point numbers.
747 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000748 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000749 if (c == '.') {
750 fraction:
751 /* Fraction */
752 do {
753 c = tok_nextc(tok);
754 } while (isdigit(c));
755 }
756 if (c == 'e' || c == 'E') {
757 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000759 if (c == '+' || c == '-')
760 c = tok_nextc(tok);
761 while (isdigit(c)) {
762 c = tok_nextc(tok);
763 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000765#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000766 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000767 /* Imaginary part */
768 imaginary:
769 c = tok_nextc(tok);
770#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000771 }
772 }
773 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000774 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775 *p_end = tok->cur;
776 return NUMBER;
777 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000778
779 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000780 /* String */
781 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000782 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 int quote = c;
784 int triple = 0;
785 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000786 for (;;) {
787 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000788 if (c == '\n') {
789 if (!triple) {
790 tok->done = E_TOKEN;
791 tok_backup(tok, c);
792 return ERRORTOKEN;
793 }
794 tripcount = 0;
795 }
796 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000798 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799 return ERRORTOKEN;
800 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000801 else if (c == quote) {
802 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000803 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000804 c = tok_nextc(tok);
805 if (c == quote) {
806 triple = 1;
807 tripcount = 0;
808 continue;
809 }
810 tok_backup(tok, c);
811 }
812 if (!triple || tripcount == 3)
813 break;
814 }
815 else if (c == '\\') {
816 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000819 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000820 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821 return ERRORTOKEN;
822 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 else
825 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000828 *p_end = tok->cur;
829 return STRING;
830 }
831
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000832 /* Line continuation */
833 if (c == '\\') {
834 c = tok_nextc(tok);
835 if (c != '\n') {
836 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000837 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838 return ERRORTOKEN;
839 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000840 goto again; /* Read next line */
841 }
842
Guido van Rossumfbab9051991-10-20 20:25:03 +0000843 /* Check for two-character token */
844 {
845 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000846 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000847 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +0000848 int c3 = tok_nextc(tok);
849 int token3 = PyToken_ThreeChars(c, c2, c3);
850 if (token3 != OP) {
851 token = token3;
852 } else {
853 tok_backup(tok, c3);
854 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000855 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000856 *p_end = tok->cur;
857 return token;
858 }
859 tok_backup(tok, c2);
860 }
861
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000863 switch (c) {
864 case '(':
865 case '[':
866 case '{':
867 tok->level++;
868 break;
869 case ')':
870 case ']':
871 case '}':
872 tok->level--;
873 break;
874 }
875
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000876 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000877 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000878 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000879 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000880}
881
882
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text between `start` and `end` in parentheses for NAME,
   NUMBER, STRING and OP tokens. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif