blob: eb84d144768e46727f0eb9374927f0e628fc0d55 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
Guido van Rossumfd71b9e2000-06-30 23:50:40 +00002Copyright (c) 2000, BeOpen.com.
3Copyright (c) 1995-2000, Corporation for National Research Initiatives.
4Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
5All rights reserved.
Guido van Rossumf70e43a1991-02-19 12:39:46 +00006
Guido van Rossumfd71b9e2000-06-30 23:50:40 +00007See the file "Misc/COPYRIGHT" for information on usage and
8redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
Guido van Rossumf70e43a1991-02-19 12:39:46 +00009******************************************************************/
10
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000011/* Tokenizer implementation */
12
Guido van Rossum3f5da241990-12-20 15:06:42 +000013#include "pgenheaders.h"
14
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000015#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000016
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000017#include "tokenizer.h"
18#include "errcode.h"
19
Tim Petersdbd9ba62000-07-09 03:09:57 +000020extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000021/* Return malloc'ed string including trailing \n;
22 empty malloc'ed string for EOF;
23 NULL if interrupted */
24
Guido van Rossum4fe87291992-02-26 15:24:44 +000025/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000026#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000028/* Convert a possibly signed character to a nonnegative int */
29/* XXX This assumes characters are 8 bits wide */
30#ifdef __CHAR_UNSIGNED__
31#define Py_CHARMASK(c) (c)
32#else
33#define Py_CHARMASK(c) ((c) & 0xff)
34#endif
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000037static struct tok_state *tok_new(void);
38static int tok_nextc(struct tok_state *tok);
39static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000040
/* Token names.

   Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Bitwise, shift and power operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	"DOUBLESTAR",

	/* Augmented assignment operators */
	"PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
	"PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

	/* Generic operator and the two pseudo entries */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
97
98
99/* Create and initialize a new tok_state structure */
100
101static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000102tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103{
Guido van Rossum86bea461997-04-29 21:03:06 +0000104 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000105 if (tok == NULL)
106 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000107 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 tok->done = E_OK;
109 tok->fp = NULL;
110 tok->tabsize = TABSIZE;
111 tok->indent = 0;
112 tok->indstack[0] = 0;
113 tok->atbol = 1;
114 tok->pendin = 0;
115 tok->prompt = tok->nextprompt = NULL;
116 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000117 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000118 tok->filename = NULL;
119 tok->altwarning = 0;
120 tok->alterror = 0;
121 tok->alttabsize = 1;
122 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 return tok;
124}
125
126
127/* Set up tokenizer for string */
128
129struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000130PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131{
132 struct tok_state *tok = tok_new();
133 if (tok == NULL)
134 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000135 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
139
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000140/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000141
142struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000143PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000144{
145 struct tok_state *tok = tok_new();
146 if (tok == NULL)
147 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000148 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
149 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000150 return NULL;
151 }
152 tok->cur = tok->inp = tok->buf;
153 tok->end = tok->buf + BUFSIZ;
154 tok->fp = fp;
155 tok->prompt = ps1;
156 tok->nextprompt = ps2;
157 return tok;
158}
159
160
161/* Free a tok_state structure */
162
163void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000164PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000165{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000166 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000167 PyMem_DEL(tok->buf);
168 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000169}
170
171
172/* Get next char, updating state; error code goes into tok->done */
173
174static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000175tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000176{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000179 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000180 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000181 if (tok->done != E_OK)
182 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000183 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000184 char *end = strchr(tok->inp, '\n');
185 if (end != NULL)
186 end++;
187 else {
188 end = strchr(tok->inp, '\0');
189 if (end == tok->inp) {
190 tok->done = E_EOF;
191 return EOF;
192 }
193 }
194 if (tok->start == NULL)
195 tok->buf = tok->cur;
196 tok->lineno++;
197 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000198 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000199 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000200 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000201 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 if (tok->nextprompt != NULL)
203 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 if (new == NULL)
205 tok->done = E_INTR;
206 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000207 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000208 tok->done = E_EOF;
209 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000210 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000211 size_t start = tok->start - tok->buf;
212 size_t oldlen = tok->cur - tok->buf;
213 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000214 char *buf = tok->buf;
215 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000216 tok->lineno++;
217 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000218 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000219 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000221 tok->done = E_NOMEM;
222 return EOF;
223 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000224 tok->buf = buf;
225 tok->cur = tok->buf + oldlen;
226 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000227 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000228 tok->inp = tok->buf + newlen;
229 tok->end = tok->inp + 1;
230 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000231 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000232 else {
233 tok->lineno++;
234 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000235 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000236 tok->buf = new;
237 tok->cur = tok->buf;
238 tok->inp = strchr(tok->buf, '\0');
239 tok->end = tok->inp + 1;
240 }
241 }
242 else {
243 int done = 0;
244 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000245 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000246 if (tok->start == NULL) {
247 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000248 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000249 if (tok->buf == NULL) {
250 tok->done = E_NOMEM;
251 return EOF;
252 }
253 tok->end = tok->buf + BUFSIZ;
254 }
255 if (fgets(tok->buf, (int)(tok->end - tok->buf),
256 tok->fp) == NULL) {
257 tok->done = E_EOF;
258 done = 1;
259 }
260 else {
261 tok->done = E_OK;
262 tok->inp = strchr(tok->buf, '\0');
263 done = tok->inp[-1] == '\n';
264 }
265 }
266 else {
267 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000268 if (feof(tok->fp)) {
269 tok->done = E_EOF;
270 done = 1;
271 }
272 else
273 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000274 }
275 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000276 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000277 while (!done) {
278 int curstart = tok->start == NULL ? -1 :
279 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000280 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000281 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000282 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000283 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000284 if (newbuf == NULL) {
285 tok->done = E_NOMEM;
286 tok->cur = tok->inp;
287 return EOF;
288 }
289 tok->buf = newbuf;
290 tok->inp = tok->buf + curvalid;
291 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000292 tok->start = curstart < 0 ? NULL :
293 tok->buf + curstart;
294 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000295 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000296 tok->fp) == NULL) {
297 /* Last line does not end in \n,
298 fake one */
299 strcpy(tok->inp, "\n");
300 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000301 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000302 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000303 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000304 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000305#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000306 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000307 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000308 pt = tok->inp - 2;
309 if (pt >= tok->buf && *pt == '\r') {
310 *pt++ = '\n';
311 *pt = '\0';
312 tok->inp = pt;
313 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000314#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000315 }
316 if (tok->done != E_OK) {
317 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000318 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000319 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000320 return EOF;
321 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000322 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000323 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000324}
325
326
327/* Back-up one character */
328
329static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000330tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000331{
332 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000333 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000334 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000335 if (*tok->cur != c)
336 *tok->cur = c;
337 }
338}
339
340
341/* Return the token corresponding to a single character */
342
343int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000344PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000345{
346 switch (c) {
347 case '(': return LPAR;
348 case ')': return RPAR;
349 case '[': return LSQB;
350 case ']': return RSQB;
351 case ':': return COLON;
352 case ',': return COMMA;
353 case ';': return SEMI;
354 case '+': return PLUS;
355 case '-': return MINUS;
356 case '*': return STAR;
357 case '/': return SLASH;
358 case '|': return VBAR;
359 case '&': return AMPER;
360 case '<': return LESS;
361 case '>': return GREATER;
362 case '=': return EQUAL;
363 case '.': return DOT;
364 case '%': return PERCENT;
365 case '`': return BACKQUOTE;
366 case '{': return LBRACE;
367 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000368 case '^': return CIRCUMFLEX;
369 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000370 default: return OP;
371 }
372}
373
374
Guido van Rossumfbab9051991-10-20 20:25:03 +0000375int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000376PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000377{
378 switch (c1) {
379 case '=':
380 switch (c2) {
381 case '=': return EQEQUAL;
382 }
383 break;
384 case '!':
385 switch (c2) {
386 case '=': return NOTEQUAL;
387 }
388 break;
389 case '<':
390 switch (c2) {
391 case '>': return NOTEQUAL;
392 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000393 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000394 }
395 break;
396 case '>':
397 switch (c2) {
398 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000399 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000400 }
401 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000402 case '+':
403 switch (c2) {
404 case '=': return PLUSEQUAL;
405 }
406 break;
407 case '-':
408 switch (c2) {
409 case '=': return MINEQUAL;
410 }
411 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000412 case '*':
413 switch (c2) {
414 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000415 case '=': return STAREQUAL;
416 }
417 break;
418 case '/':
419 switch (c2) {
420 case '=': return SLASHEQUAL;
421 }
422 break;
423 case '|':
424 switch (c2) {
425 case '=': return VBAREQUAL;
426 }
427 break;
428 case '%':
429 switch (c2) {
430 case '=': return PERCENTEQUAL;
431 }
432 break;
433 case '&':
434 switch (c2) {
435 case '=': return AMPEREQUAL;
436 }
437 break;
438 case '^':
439 switch (c2) {
440 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000441 }
442 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000443 }
444 return OP;
445}
446
Thomas Wouters434d0822000-08-24 20:11:32 +0000447int
448PyToken_ThreeChars(int c1, int c2, int c3)
449{
450 switch (c1) {
451 case '<':
452 switch (c2) {
453 case '<':
454 switch (c3) {
455 case '=':
456 return LEFTSHIFTEQUAL;
457 break;
458 }
459 break;
460 }
461 break;
462 case '>':
463 switch (c2) {
464 case '>':
465 switch (c3) {
466 case '=':
467 return RIGHTSHIFTEQUAL;
468 break;
469 }
470 break;
471 }
472 break;
473 case '*':
474 switch (c2) {
475 case '*':
476 switch (c3) {
477 case '=':
478 return DOUBLESTAREQUAL;
479 break;
480 }
481 break;
482 }
483 break;
484 }
485 return OP;
486}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000487
Guido van Rossum926f13a1998-04-09 21:38:06 +0000488static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000489indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000490{
491 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000492 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000493 tok->cur = tok->inp;
494 return 1;
495 }
496 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000497 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
498 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000499 tok->altwarning = 0;
500 }
501 return 0;
502}
503
504
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000505/* Get next token, after space stripping etc. */
506
507int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000508PyTokenizer_Get(register struct tok_state *tok, char **p_start,
509 char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000510{
511 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000512 int blankline;
513
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000514 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000515 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000516 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000517 blankline = 0;
518
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000519 /* Get indentation level */
520 if (tok->atbol) {
521 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000522 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000523 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000524 for (;;) {
525 c = tok_nextc(tok);
526 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000527 col++, altcol++;
528 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000529 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000530 altcol = (altcol/tok->alttabsize + 1)
531 * tok->alttabsize;
532 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000533 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000534 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000535 else
536 break;
537 }
538 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000539 if (c == '#' || c == '\n') {
540 /* Lines with only whitespace and/or comments
541 shouldn't affect the indentation and are
542 not passed to the parser as NEWLINE tokens,
543 except *totally* empty lines in interactive
544 mode, which signal the end of a command group. */
545 if (col == 0 && c == '\n' && tok->prompt != NULL)
546 blankline = 0; /* Let it through */
547 else
548 blankline = 1; /* Ignore completely */
549 /* We can't jump back right here since we still
550 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000551 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000552 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000553 if (col == tok->indstack[tok->indent]) {
554 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000555 if (altcol != tok->altindstack[tok->indent]) {
556 if (indenterror(tok))
557 return ERRORTOKEN;
558 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000559 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000560 else if (col > tok->indstack[tok->indent]) {
561 /* Indent -- always one */
562 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000563 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000564 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000565 return ERRORTOKEN;
566 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000567 if (altcol <= tok->altindstack[tok->indent]) {
568 if (indenterror(tok))
569 return ERRORTOKEN;
570 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000571 tok->pendin++;
572 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000573 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000574 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000575 else /* col < tok->indstack[tok->indent] */ {
576 /* Dedent -- any number, must be consistent */
577 while (tok->indent > 0 &&
578 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000579 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000580 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000581 }
582 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000583 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000584 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000585 return ERRORTOKEN;
586 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000587 if (altcol != tok->altindstack[tok->indent]) {
588 if (indenterror(tok))
589 return ERRORTOKEN;
590 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000591 }
592 }
593 }
594
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000595 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000596
597 /* Return pending indents/dedents */
598 if (tok->pendin != 0) {
599 if (tok->pendin < 0) {
600 tok->pendin++;
601 return DEDENT;
602 }
603 else {
604 tok->pendin--;
605 return INDENT;
606 }
607 }
608
609 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000610 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000611 /* Skip spaces */
612 do {
613 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000614 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000615
616 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000617 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000618
Guido van Rossumab5ca152000-03-31 00:52:27 +0000619 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000620 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000621 static char *tabforms[] = {
622 "tab-width:", /* Emacs */
623 ":tabstop=", /* vim, full form */
624 ":ts=", /* vim, abbreviated form */
625 "set tabsize=", /* will vi never die? */
626 /* more templates can be added here to support other editors */
627 };
628 char cbuf[80];
629 char *tp, **cp;
630 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000632 *tp++ = c = tok_nextc(tok);
633 } while (c != EOF && c != '\n' &&
634 tp - cbuf + 1 < sizeof(cbuf));
635 *tp = '\0';
636 for (cp = tabforms;
637 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
638 cp++) {
639 if ((tp = strstr(cbuf, *cp))) {
640 int newsize = atoi(tp + strlen(*cp));
641
642 if (newsize >= 1 && newsize <= 40) {
643 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000644 if (Py_VerboseFlag)
645 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000646 "Tab size set to %d\n",
647 newsize);
648 }
649 }
650 }
651 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653 }
654
655 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000656 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000658 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659
660 /* Identifier (most frequent token!) */
661 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000662 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000663 switch (c) {
664 case 'r':
665 case 'R':
666 c = tok_nextc(tok);
667 if (c == '"' || c == '\'')
668 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000669 break;
670 case 'u':
671 case 'U':
672 c = tok_nextc(tok);
673 if (c == 'r' || c == 'R')
674 c = tok_nextc(tok);
675 if (c == '"' || c == '\'')
676 goto letter_quote;
677 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000678 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000679 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000680 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000681 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000683 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 *p_end = tok->cur;
685 return NAME;
686 }
687
688 /* Newline */
689 if (c == '\n') {
690 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000691 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000692 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000693 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000694 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
695 return NEWLINE;
696 }
697
Guido van Rossum2d45be11997-04-11 19:16:25 +0000698#ifdef macintosh
699 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000700 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000701 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000702 tok->done = E_TOKEN;
703 tok->cur = tok->inp;
704 return ERRORTOKEN;
705 }
706#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000707 /* Period or number starting with period? */
708 if (c == '.') {
709 c = tok_nextc(tok);
710 if (isdigit(c)) {
711 goto fraction;
712 }
713 else {
714 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000715 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000716 *p_end = tok->cur;
717 return DOT;
718 }
719 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000720
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000721 /* Number */
722 if (isdigit(c)) {
723 if (c == '0') {
724 /* Hex or octal */
725 c = tok_nextc(tok);
726 if (c == '.')
727 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000728#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000729 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000730 goto imaginary;
731#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732 if (c == 'x' || c == 'X') {
733 /* Hex */
734 do {
735 c = tok_nextc(tok);
736 } while (isxdigit(c));
737 }
738 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000739 /* XXX This is broken! E.g.,
740 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741 /* Octal; c is first char of it */
742 /* There's no 'isoctdigit' macro, sigh */
743 while ('0' <= c && c < '8') {
744 c = tok_nextc(tok);
745 }
746 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000747 if (c == 'l' || c == 'L')
748 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749 }
750 else {
751 /* Decimal */
752 do {
753 c = tok_nextc(tok);
754 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000755 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000757 else {
758 /* Accept floating point numbers.
759 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000760 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000761 if (c == '.') {
762 fraction:
763 /* Fraction */
764 do {
765 c = tok_nextc(tok);
766 } while (isdigit(c));
767 }
768 if (c == 'e' || c == 'E') {
769 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000771 if (c == '+' || c == '-')
772 c = tok_nextc(tok);
773 while (isdigit(c)) {
774 c = tok_nextc(tok);
775 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000777#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000778 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000779 /* Imaginary part */
780 imaginary:
781 c = tok_nextc(tok);
782#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 }
784 }
785 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000786 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787 *p_end = tok->cur;
788 return NUMBER;
789 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000790
791 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000792 /* String */
793 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000794 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000795 int quote = c;
796 int triple = 0;
797 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798 for (;;) {
799 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000800 if (c == '\n') {
801 if (!triple) {
802 tok->done = E_TOKEN;
803 tok_backup(tok, c);
804 return ERRORTOKEN;
805 }
806 tripcount = 0;
807 }
808 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000810 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 return ERRORTOKEN;
812 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000813 else if (c == quote) {
814 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000815 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 c = tok_nextc(tok);
817 if (c == quote) {
818 triple = 1;
819 tripcount = 0;
820 continue;
821 }
822 tok_backup(tok, c);
823 }
824 if (!triple || tripcount == 3)
825 break;
826 }
827 else if (c == '\\') {
828 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000831 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000832 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000833 return ERRORTOKEN;
834 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 else
837 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000840 *p_end = tok->cur;
841 return STRING;
842 }
843
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000844 /* Line continuation */
845 if (c == '\\') {
846 c = tok_nextc(tok);
847 if (c != '\n') {
848 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000849 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000850 return ERRORTOKEN;
851 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852 goto again; /* Read next line */
853 }
854
Guido van Rossumfbab9051991-10-20 20:25:03 +0000855 /* Check for two-character token */
856 {
857 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000858 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000859 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +0000860 int c3 = tok_nextc(tok);
861 int token3 = PyToken_ThreeChars(c, c2, c3);
862 if (token3 != OP) {
863 token = token3;
864 } else {
865 tok_backup(tok, c3);
866 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000867 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000868 *p_end = tok->cur;
869 return token;
870 }
871 tok_backup(tok, c2);
872 }
873
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000875 switch (c) {
876 case '(':
877 case '[':
878 case '{':
879 tok->level++;
880 break;
881 case ')':
882 case ']':
883 case '}':
884 tok->level--;
885 break;
886 }
887
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000888 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000890 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000891 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000892}
893
894
Guido van Rossum408027e1996-12-30 16:17:54 +0000895#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896
897void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000898tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000899{
Guido van Rossum86bea461997-04-29 21:03:06 +0000900 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000901 if (type == NAME || type == NUMBER || type == STRING || type == OP)
902 printf("(%.*s)", (int)(end - start), start);
903}
904
905#endif