blob: d4ec34593a7448453a5d444895ee94ef3c38bf26 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
Guido van Rossumfd71b9e2000-06-30 23:50:40 +00002Copyright (c) 2000, BeOpen.com.
3Copyright (c) 1995-2000, Corporation for National Research Initiatives.
4Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
5All rights reserved.
Guido van Rossumf70e43a1991-02-19 12:39:46 +00006
Guido van Rossumfd71b9e2000-06-30 23:50:40 +00007See the file "Misc/COPYRIGHT" for information on usage and
8redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
Guido van Rossumf70e43a1991-02-19 12:39:46 +00009******************************************************************/
10
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000011/* Tokenizer implementation */
12
Guido van Rossum3f5da241990-12-20 15:06:42 +000013#include "pgenheaders.h"
14
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000015#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000016
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000017#include "tokenizer.h"
18#include "errcode.h"
19
/* Line reader used for interactive input (declared here, not defined
   in this file).
   Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(char *);

/* Default tab stop used when measuring indentation.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#if defined(__CHAR_UNSIGNED__)
#define Py_CHARMASK(c)    (c)
#else
#define Py_CHARMASK(c)    ((c) & 0xff)
#endif

/* Forward declarations of the file-local helpers */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000040
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT", "LPAR",
    /*  8 */ "RPAR", "LSQB", "RSQB", "COLON",
    /* 12 */ "COMMA", "SEMI", "PLUS", "MINUS",
    /* 16 */ "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT",
    /* 24 */ "PERCENT", "BACKQUOTE", "LBRACE", "RBRACE",
    /* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 36 */ "DOUBLESTAR", "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
86
87
88/* Create and initialize a new tok_state structure */
89
90static struct tok_state *
91tok_new()
92{
Guido van Rossum86bea461997-04-29 21:03:06 +000093 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000094 if (tok == NULL)
95 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +000096 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000097 tok->done = E_OK;
98 tok->fp = NULL;
99 tok->tabsize = TABSIZE;
100 tok->indent = 0;
101 tok->indstack[0] = 0;
102 tok->atbol = 1;
103 tok->pendin = 0;
104 tok->prompt = tok->nextprompt = NULL;
105 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000106 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000107 tok->filename = NULL;
108 tok->altwarning = 0;
109 tok->alterror = 0;
110 tok->alttabsize = 1;
111 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000119PyTokenizer_FromString(str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 char *str;
121{
122 struct tok_state *tok = tok_new();
123 if (tok == NULL)
124 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000125 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000126 return tok;
127}
128
129
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000130/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131
132struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000133PyTokenizer_FromFile(fp, ps1, ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000134 FILE *fp;
135 char *ps1, *ps2;
136{
137 struct tok_state *tok = tok_new();
138 if (tok == NULL)
139 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000140 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
141 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000142 return NULL;
143 }
144 tok->cur = tok->inp = tok->buf;
145 tok->end = tok->buf + BUFSIZ;
146 tok->fp = fp;
147 tok->prompt = ps1;
148 tok->nextprompt = ps2;
149 return tok;
150}
151
152
153/* Free a tok_state structure */
154
155void
Guido van Rossum86bea461997-04-29 21:03:06 +0000156PyTokenizer_Free(tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000157 struct tok_state *tok;
158{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000160 PyMem_DEL(tok->buf);
161 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
168tok_nextc(tok)
169 register struct tok_state *tok;
170{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000171 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000172 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000173 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000174 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000175 if (tok->done != E_OK)
176 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
186 }
187 }
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000192 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000194 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000195 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000201 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 tok->done = E_EOF;
203 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000205 size_t start = tok->start - tok->buf;
206 size_t oldlen = tok->cur - tok->buf;
207 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000208 char *buf = tok->buf;
209 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000210 tok->lineno++;
211 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000212 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000213 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000214 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000215 tok->done = E_NOMEM;
216 return EOF;
217 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000218 tok->buf = buf;
219 tok->cur = tok->buf + oldlen;
220 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000221 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000222 tok->inp = tok->buf + newlen;
223 tok->end = tok->inp + 1;
224 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000225 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000226 else {
227 tok->lineno++;
228 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000230 tok->buf = new;
231 tok->cur = tok->buf;
232 tok->inp = strchr(tok->buf, '\0');
233 tok->end = tok->inp + 1;
234 }
235 }
236 else {
237 int done = 0;
238 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000239 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000240 if (tok->start == NULL) {
241 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000242 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000243 if (tok->buf == NULL) {
244 tok->done = E_NOMEM;
245 return EOF;
246 }
247 tok->end = tok->buf + BUFSIZ;
248 }
249 if (fgets(tok->buf, (int)(tok->end - tok->buf),
250 tok->fp) == NULL) {
251 tok->done = E_EOF;
252 done = 1;
253 }
254 else {
255 tok->done = E_OK;
256 tok->inp = strchr(tok->buf, '\0');
257 done = tok->inp[-1] == '\n';
258 }
259 }
260 else {
261 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000262 if (feof(tok->fp)) {
263 tok->done = E_EOF;
264 done = 1;
265 }
266 else
267 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000268 }
269 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000270 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000271 while (!done) {
272 int curstart = tok->start == NULL ? -1 :
273 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000274 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000275 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000276 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000277 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000278 if (newbuf == NULL) {
279 tok->done = E_NOMEM;
280 tok->cur = tok->inp;
281 return EOF;
282 }
283 tok->buf = newbuf;
284 tok->inp = tok->buf + curvalid;
285 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000286 tok->start = curstart < 0 ? NULL :
287 tok->buf + curstart;
288 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000289 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000290 tok->fp) == NULL) {
291 /* Last line does not end in \n,
292 fake one */
293 strcpy(tok->inp, "\n");
294 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000295 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000296 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000297 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000298 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000299#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000300 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000301 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000302 pt = tok->inp - 2;
303 if (pt >= tok->buf && *pt == '\r') {
304 *pt++ = '\n';
305 *pt = '\0';
306 tok->inp = pt;
307 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000308#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000309 }
310 if (tok->done != E_OK) {
311 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000312 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000313 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000314 return EOF;
315 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000316 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000317 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000318}
319
320
321/* Back-up one character */
322
323static void
324tok_backup(tok, c)
325 register struct tok_state *tok;
326 register int c;
327{
328 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000329 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000330 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000331 if (*tok->cur != c)
332 *tok->cur = c;
333 }
334}
335
336
337/* Return the token corresponding to a single character */
338
339int
Guido van Rossum86bea461997-04-29 21:03:06 +0000340PyToken_OneChar(c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000341 int c;
342{
343 switch (c) {
344 case '(': return LPAR;
345 case ')': return RPAR;
346 case '[': return LSQB;
347 case ']': return RSQB;
348 case ':': return COLON;
349 case ',': return COMMA;
350 case ';': return SEMI;
351 case '+': return PLUS;
352 case '-': return MINUS;
353 case '*': return STAR;
354 case '/': return SLASH;
355 case '|': return VBAR;
356 case '&': return AMPER;
357 case '<': return LESS;
358 case '>': return GREATER;
359 case '=': return EQUAL;
360 case '.': return DOT;
361 case '%': return PERCENT;
362 case '`': return BACKQUOTE;
363 case '{': return LBRACE;
364 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000365 case '^': return CIRCUMFLEX;
366 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000367 default: return OP;
368 }
369}
370
371
Guido van Rossumfbab9051991-10-20 20:25:03 +0000372int
Guido van Rossum86bea461997-04-29 21:03:06 +0000373PyToken_TwoChars(c1, c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000374 int c1, c2;
375{
376 switch (c1) {
377 case '=':
378 switch (c2) {
379 case '=': return EQEQUAL;
380 }
381 break;
382 case '!':
383 switch (c2) {
384 case '=': return NOTEQUAL;
385 }
386 break;
387 case '<':
388 switch (c2) {
389 case '>': return NOTEQUAL;
390 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000391 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000392 }
393 break;
394 case '>':
395 switch (c2) {
396 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000397 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000398 }
399 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000400 case '*':
401 switch (c2) {
402 case '*': return DOUBLESTAR;
403 }
404 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000405 }
406 return OP;
407}
408
409
Guido van Rossum926f13a1998-04-09 21:38:06 +0000410static int
411indenterror(tok)
412 struct tok_state *tok;
413{
414 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000415 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000416 tok->cur = tok->inp;
417 return 1;
418 }
419 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000420 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
421 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000422 tok->altwarning = 0;
423 }
424 return 0;
425}
426
427
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000428/* Get next token, after space stripping etc. */
429
430int
Guido van Rossum86bea461997-04-29 21:03:06 +0000431PyTokenizer_Get(tok, p_start, p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000432 register struct tok_state *tok; /* In/out: tokenizer state */
433 char **p_start, **p_end; /* Out: point to start/end of token */
434{
435 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000436 int blankline;
437
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000438 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000439 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000440 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000441 blankline = 0;
442
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000443 /* Get indentation level */
444 if (tok->atbol) {
445 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000446 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000447 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000448 for (;;) {
449 c = tok_nextc(tok);
450 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000451 col++, altcol++;
452 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000453 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000454 altcol = (altcol/tok->alttabsize + 1)
455 * tok->alttabsize;
456 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000457 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000458 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000459 else
460 break;
461 }
462 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000463 if (c == '#' || c == '\n') {
464 /* Lines with only whitespace and/or comments
465 shouldn't affect the indentation and are
466 not passed to the parser as NEWLINE tokens,
467 except *totally* empty lines in interactive
468 mode, which signal the end of a command group. */
469 if (col == 0 && c == '\n' && tok->prompt != NULL)
470 blankline = 0; /* Let it through */
471 else
472 blankline = 1; /* Ignore completely */
473 /* We can't jump back right here since we still
474 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000475 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000476 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000477 if (col == tok->indstack[tok->indent]) {
478 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000479 if (altcol != tok->altindstack[tok->indent]) {
480 if (indenterror(tok))
481 return ERRORTOKEN;
482 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000483 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000484 else if (col > tok->indstack[tok->indent]) {
485 /* Indent -- always one */
486 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000487 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000488 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000489 return ERRORTOKEN;
490 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000491 if (altcol <= tok->altindstack[tok->indent]) {
492 if (indenterror(tok))
493 return ERRORTOKEN;
494 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000495 tok->pendin++;
496 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000497 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000498 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000499 else /* col < tok->indstack[tok->indent] */ {
500 /* Dedent -- any number, must be consistent */
501 while (tok->indent > 0 &&
502 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000503 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000504 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000505 }
506 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000507 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000508 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000509 return ERRORTOKEN;
510 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000511 if (altcol != tok->altindstack[tok->indent]) {
512 if (indenterror(tok))
513 return ERRORTOKEN;
514 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000515 }
516 }
517 }
518
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000519 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000520
521 /* Return pending indents/dedents */
522 if (tok->pendin != 0) {
523 if (tok->pendin < 0) {
524 tok->pendin++;
525 return DEDENT;
526 }
527 else {
528 tok->pendin--;
529 return INDENT;
530 }
531 }
532
533 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000534 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000535 /* Skip spaces */
536 do {
537 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000538 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000539
540 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000541 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000542
Guido van Rossumab5ca152000-03-31 00:52:27 +0000543 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000544 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000545 static char *tabforms[] = {
546 "tab-width:", /* Emacs */
547 ":tabstop=", /* vim, full form */
548 ":ts=", /* vim, abbreviated form */
549 "set tabsize=", /* will vi never die? */
550 /* more templates can be added here to support other editors */
551 };
552 char cbuf[80];
553 char *tp, **cp;
554 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000555 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000556 *tp++ = c = tok_nextc(tok);
557 } while (c != EOF && c != '\n' &&
558 tp - cbuf + 1 < sizeof(cbuf));
559 *tp = '\0';
560 for (cp = tabforms;
561 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
562 cp++) {
563 if ((tp = strstr(cbuf, *cp))) {
564 int newsize = atoi(tp + strlen(*cp));
565
566 if (newsize >= 1 && newsize <= 40) {
567 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000568 if (Py_VerboseFlag)
569 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000570 "Tab size set to %d\n",
571 newsize);
572 }
573 }
574 }
575 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000576 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000577 }
578
579 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000580 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000581 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000582 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000583
584 /* Identifier (most frequent token!) */
585 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000586 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000587 switch (c) {
588 case 'r':
589 case 'R':
590 c = tok_nextc(tok);
591 if (c == '"' || c == '\'')
592 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000593 break;
594 case 'u':
595 case 'U':
596 c = tok_nextc(tok);
597 if (c == 'r' || c == 'R')
598 c = tok_nextc(tok);
599 if (c == '"' || c == '\'')
600 goto letter_quote;
601 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000602 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000603 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000604 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000605 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000607 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608 *p_end = tok->cur;
609 return NAME;
610 }
611
612 /* Newline */
613 if (c == '\n') {
614 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000615 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000616 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000617 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000618 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
619 return NEWLINE;
620 }
621
Guido van Rossum2d45be11997-04-11 19:16:25 +0000622#ifdef macintosh
623 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000624 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000625 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000626 tok->done = E_TOKEN;
627 tok->cur = tok->inp;
628 return ERRORTOKEN;
629 }
630#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000631 /* Period or number starting with period? */
632 if (c == '.') {
633 c = tok_nextc(tok);
634 if (isdigit(c)) {
635 goto fraction;
636 }
637 else {
638 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000639 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000640 *p_end = tok->cur;
641 return DOT;
642 }
643 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000644
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645 /* Number */
646 if (isdigit(c)) {
647 if (c == '0') {
648 /* Hex or octal */
649 c = tok_nextc(tok);
650 if (c == '.')
651 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000652#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000653 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000654 goto imaginary;
655#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 if (c == 'x' || c == 'X') {
657 /* Hex */
658 do {
659 c = tok_nextc(tok);
660 } while (isxdigit(c));
661 }
662 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000663 /* XXX This is broken! E.g.,
664 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000665 /* Octal; c is first char of it */
666 /* There's no 'isoctdigit' macro, sigh */
667 while ('0' <= c && c < '8') {
668 c = tok_nextc(tok);
669 }
670 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000671 if (c == 'l' || c == 'L')
672 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 }
674 else {
675 /* Decimal */
676 do {
677 c = tok_nextc(tok);
678 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000679 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000680 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000681 else {
682 /* Accept floating point numbers.
683 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000684 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000685 if (c == '.') {
686 fraction:
687 /* Fraction */
688 do {
689 c = tok_nextc(tok);
690 } while (isdigit(c));
691 }
692 if (c == 'e' || c == 'E') {
693 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000694 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000695 if (c == '+' || c == '-')
696 c = tok_nextc(tok);
697 while (isdigit(c)) {
698 c = tok_nextc(tok);
699 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000700 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000701#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000702 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000703 /* Imaginary part */
704 imaginary:
705 c = tok_nextc(tok);
706#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000707 }
708 }
709 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000710 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711 *p_end = tok->cur;
712 return NUMBER;
713 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000714
715 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000716 /* String */
717 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000718 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000719 int quote = c;
720 int triple = 0;
721 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722 for (;;) {
723 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000724 if (c == '\n') {
725 if (!triple) {
726 tok->done = E_TOKEN;
727 tok_backup(tok, c);
728 return ERRORTOKEN;
729 }
730 tripcount = 0;
731 }
732 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000733 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000734 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735 return ERRORTOKEN;
736 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000737 else if (c == quote) {
738 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000739 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000740 c = tok_nextc(tok);
741 if (c == quote) {
742 triple = 1;
743 tripcount = 0;
744 continue;
745 }
746 tok_backup(tok, c);
747 }
748 if (!triple || tripcount == 3)
749 break;
750 }
751 else if (c == '\\') {
752 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000754 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000756 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757 return ERRORTOKEN;
758 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000760 else
761 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000763 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000764 *p_end = tok->cur;
765 return STRING;
766 }
767
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768 /* Line continuation */
769 if (c == '\\') {
770 c = tok_nextc(tok);
771 if (c != '\n') {
772 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000773 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 return ERRORTOKEN;
775 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 goto again; /* Read next line */
777 }
778
Guido van Rossumfbab9051991-10-20 20:25:03 +0000779 /* Check for two-character token */
780 {
781 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000782 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000783 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000784 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000785 *p_end = tok->cur;
786 return token;
787 }
788 tok_backup(tok, c2);
789 }
790
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000792 switch (c) {
793 case '(':
794 case '[':
795 case '{':
796 tok->level++;
797 break;
798 case ')':
799 case ']':
800 case '}':
801 tok->level--;
802 break;
803 }
804
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000806 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000808 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809}
810
811
#ifdef Py_DEBUG

/* Debugging aid: print the name of token type 'type' on stdout.
   For NAME, NUMBER, STRING and OP tokens the token text, i.e. the
   characters from start up to (not including) end, follows in
   parentheses.  type is used directly as an index into
   _PyParser_TokenNames and is not range-checked. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif