/***********************************************************
Copyright (c) 2000, BeOpen.com.
Copyright (c) 1995-2000, Corporation for National Research Initiatives.
Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
All rights reserved.

See the file "Misc/COPYRIGHT" for information on usage and
redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
******************************************************************/
10
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000011/* Tokenizer implementation */
12
Guido van Rossum3f5da241990-12-20 15:06:42 +000013#include "pgenheaders.h"
14
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000015#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000016
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000017#include "tokenizer.h"
18#include "errcode.h"
19
/* Line reader used for prompted (interactive) input.
   Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(char *);

/* Default tab stop width.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#if defined(__CHAR_UNSIGNED__)
#define Py_CHARMASK(c)  (c)
#else
#define Py_CHARMASK(c)  ((c) & 0xff)
#endif

/* Forward declarations of the file-local helpers */
static void tok_backup(struct tok_state *tok, int c);
static int tok_nextc(struct tok_state *tok);
static struct tok_state *tok_new(void);
Guido van Rossum3f5da241990-12-20 15:06:42 +000040
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* 0-6: structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* 7-10: parentheses and square brackets */
    "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11-13: separators */
    "COLON", "COMMA", "SEMI",
    /* 14-27: single-character operators and delimiters */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* 28-31: two-character comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 32-36: remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* 37-39: catch-all and sentinels */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
86
87
88/* Create and initialize a new tok_state structure */
89
90static struct tok_state *
91tok_new()
92{
Guido van Rossum86bea461997-04-29 21:03:06 +000093 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000094 if (tok == NULL)
95 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +000096 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000097 tok->done = E_OK;
98 tok->fp = NULL;
99 tok->tabsize = TABSIZE;
100 tok->indent = 0;
101 tok->indstack[0] = 0;
102 tok->atbol = 1;
103 tok->pendin = 0;
104 tok->prompt = tok->nextprompt = NULL;
105 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000106 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000107 tok->filename = NULL;
108 tok->altwarning = 0;
109 tok->alterror = 0;
110 tok->alttabsize = 1;
111 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000119PyTokenizer_FromString(str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 char *str;
121{
122 struct tok_state *tok = tok_new();
123 if (tok == NULL)
124 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000125 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000126 return tok;
127}
128
129
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000130/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131
132struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000133PyTokenizer_FromFile(fp, ps1, ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000134 FILE *fp;
135 char *ps1, *ps2;
136{
137 struct tok_state *tok = tok_new();
138 if (tok == NULL)
139 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000140 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
141 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000142 return NULL;
143 }
144 tok->cur = tok->inp = tok->buf;
145 tok->end = tok->buf + BUFSIZ;
146 tok->fp = fp;
147 tok->prompt = ps1;
148 tok->nextprompt = ps2;
149 return tok;
150}
151
152
153/* Free a tok_state structure */
154
155void
Guido van Rossum86bea461997-04-29 21:03:06 +0000156PyTokenizer_Free(tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000157 struct tok_state *tok;
158{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000160 PyMem_DEL(tok->buf);
161 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
168tok_nextc(tok)
169 register struct tok_state *tok;
170{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000171 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000172 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000173 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000174 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000175 if (tok->done != E_OK)
176 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
186 }
187 }
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000192 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000194 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000195 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000201 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 tok->done = E_EOF;
203 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000205 size_t start = tok->start - tok->buf;
206 size_t oldlen = tok->cur - tok->buf;
207 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000208 char *buf = tok->buf;
209 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000210 tok->lineno++;
211 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000212 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000213 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000214 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000215 tok->done = E_NOMEM;
216 return EOF;
217 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000218 tok->buf = buf;
219 tok->cur = tok->buf + oldlen;
220 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000221 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000222 tok->inp = tok->buf + newlen;
223 tok->end = tok->inp + 1;
224 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000225 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000226 else {
227 tok->lineno++;
228 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000230 tok->buf = new;
231 tok->cur = tok->buf;
232 tok->inp = strchr(tok->buf, '\0');
233 tok->end = tok->inp + 1;
234 }
235 }
236 else {
237 int done = 0;
238 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000239 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000240 if (tok->start == NULL) {
241 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000242 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000243 if (tok->buf == NULL) {
244 tok->done = E_NOMEM;
245 return EOF;
246 }
247 tok->end = tok->buf + BUFSIZ;
248 }
249 if (fgets(tok->buf, (int)(tok->end - tok->buf),
250 tok->fp) == NULL) {
251 tok->done = E_EOF;
252 done = 1;
253 }
254 else {
255 tok->done = E_OK;
256 tok->inp = strchr(tok->buf, '\0');
257 done = tok->inp[-1] == '\n';
258 }
259 }
260 else {
261 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000262 if (feof(tok->fp)) {
263 tok->done = E_EOF;
264 done = 1;
265 }
266 else
267 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000268 }
269 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000270 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000271 while (!done) {
272 int curstart = tok->start == NULL ? -1 :
273 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000274 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000275 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000276 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000277 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000278 if (newbuf == NULL) {
279 tok->done = E_NOMEM;
280 tok->cur = tok->inp;
281 return EOF;
282 }
283 tok->buf = newbuf;
284 tok->inp = tok->buf + curvalid;
285 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000286 tok->start = curstart < 0 ? NULL :
287 tok->buf + curstart;
288 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000289 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000290 tok->fp) == NULL) {
291 /* Last line does not end in \n,
292 fake one */
293 strcpy(tok->inp, "\n");
294 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000295 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000296 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000297 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000298 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000299#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000300 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000301 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000302 pt = tok->inp - 2;
303 if (pt >= tok->buf && *pt == '\r') {
304 *pt++ = '\n';
305 *pt = '\0';
306 tok->inp = pt;
307 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000308#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000309 }
310 if (tok->done != E_OK) {
311 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000312 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000313 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000314 return EOF;
315 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000316 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000317 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000318}
319
320
321/* Back-up one character */
322
323static void
324tok_backup(tok, c)
325 register struct tok_state *tok;
326 register int c;
327{
328 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000329 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000330 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000331 if (*tok->cur != c)
332 *tok->cur = c;
333 }
334}
335
336
337/* Return the token corresponding to a single character */
338
339int
Guido van Rossum86bea461997-04-29 21:03:06 +0000340PyToken_OneChar(c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000341 int c;
342{
343 switch (c) {
344 case '(': return LPAR;
345 case ')': return RPAR;
346 case '[': return LSQB;
347 case ']': return RSQB;
348 case ':': return COLON;
349 case ',': return COMMA;
350 case ';': return SEMI;
351 case '+': return PLUS;
352 case '-': return MINUS;
353 case '*': return STAR;
354 case '/': return SLASH;
355 case '|': return VBAR;
356 case '&': return AMPER;
357 case '<': return LESS;
358 case '>': return GREATER;
359 case '=': return EQUAL;
360 case '.': return DOT;
361 case '%': return PERCENT;
362 case '`': return BACKQUOTE;
363 case '{': return LBRACE;
364 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000365 case '^': return CIRCUMFLEX;
366 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000367 default: return OP;
368 }
369}
370
371
Guido van Rossumfbab9051991-10-20 20:25:03 +0000372int
Guido van Rossum86bea461997-04-29 21:03:06 +0000373PyToken_TwoChars(c1, c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000374 int c1, c2;
375{
376 switch (c1) {
377 case '=':
378 switch (c2) {
379 case '=': return EQEQUAL;
380 }
381 break;
382 case '!':
383 switch (c2) {
384 case '=': return NOTEQUAL;
385 }
386 break;
387 case '<':
388 switch (c2) {
389 case '>': return NOTEQUAL;
390 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000391 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000392 }
393 break;
394 case '>':
395 switch (c2) {
396 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000397 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000398 }
399 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000400 case '*':
401 switch (c2) {
402 case '*': return DOUBLESTAR;
403 }
404 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000405 }
406 return OP;
407}
408
409
Guido van Rossum926f13a1998-04-09 21:38:06 +0000410static int
411indenterror(tok)
412 struct tok_state *tok;
413{
414 if (tok->alterror) {
415 tok->done = E_INDENT;
416 tok->cur = tok->inp;
417 return 1;
418 }
419 if (tok->altwarning) {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000420 PySys_WriteStderr("%s: inconsistent tab/space usage\n",
Guido van Rossum926f13a1998-04-09 21:38:06 +0000421 tok->filename);
422 tok->altwarning = 0;
423 }
424 return 0;
425}
426
427
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000428/* Get next token, after space stripping etc. */
429
430int
Guido van Rossum86bea461997-04-29 21:03:06 +0000431PyTokenizer_Get(tok, p_start, p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000432 register struct tok_state *tok; /* In/out: tokenizer state */
433 char **p_start, **p_end; /* Out: point to start/end of token */
434{
435 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000436 int blankline;
437
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000438 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000439 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000440 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000441 blankline = 0;
442
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000443 /* Get indentation level */
444 if (tok->atbol) {
445 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000446 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000447 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000448 for (;;) {
449 c = tok_nextc(tok);
450 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000451 col++, altcol++;
452 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000453 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000454 altcol = (altcol/tok->alttabsize + 1)
455 * tok->alttabsize;
456 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000457 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000458 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000459 else
460 break;
461 }
462 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000463 if (c == '#' || c == '\n') {
464 /* Lines with only whitespace and/or comments
465 shouldn't affect the indentation and are
466 not passed to the parser as NEWLINE tokens,
467 except *totally* empty lines in interactive
468 mode, which signal the end of a command group. */
469 if (col == 0 && c == '\n' && tok->prompt != NULL)
470 blankline = 0; /* Let it through */
471 else
472 blankline = 1; /* Ignore completely */
473 /* We can't jump back right here since we still
474 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000475 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000476 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000477 if (col == tok->indstack[tok->indent]) {
478 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000479 if (altcol != tok->altindstack[tok->indent]) {
480 if (indenterror(tok))
481 return ERRORTOKEN;
482 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000483 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000484 else if (col > tok->indstack[tok->indent]) {
485 /* Indent -- always one */
486 if (tok->indent+1 >= MAXINDENT) {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000487 PySys_WriteStderr(
488 "excessive indent\n");
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000489 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000490 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000491 return ERRORTOKEN;
492 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000493 if (altcol <= tok->altindstack[tok->indent]) {
494 if (indenterror(tok))
495 return ERRORTOKEN;
496 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000497 tok->pendin++;
498 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000499 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000500 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000501 else /* col < tok->indstack[tok->indent] */ {
502 /* Dedent -- any number, must be consistent */
503 while (tok->indent > 0 &&
504 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000505 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000506 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000507 }
508 if (col != tok->indstack[tok->indent]) {
Guido van Rossumd5516bc1998-12-04 18:51:01 +0000509 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000510 "inconsistent dedent\n");
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000511 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000512 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000513 return ERRORTOKEN;
514 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000515 if (altcol != tok->altindstack[tok->indent]) {
516 if (indenterror(tok))
517 return ERRORTOKEN;
518 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000519 }
520 }
521 }
522
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000523 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000524
525 /* Return pending indents/dedents */
526 if (tok->pendin != 0) {
527 if (tok->pendin < 0) {
528 tok->pendin++;
529 return DEDENT;
530 }
531 else {
532 tok->pendin--;
533 return INDENT;
534 }
535 }
536
537 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000538 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000539 /* Skip spaces */
540 do {
541 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000542 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000543
544 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000545 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000546
Guido van Rossumab5ca152000-03-31 00:52:27 +0000547 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000548 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000549 static char *tabforms[] = {
550 "tab-width:", /* Emacs */
551 ":tabstop=", /* vim, full form */
552 ":ts=", /* vim, abbreviated form */
553 "set tabsize=", /* will vi never die? */
554 /* more templates can be added here to support other editors */
555 };
556 char cbuf[80];
557 char *tp, **cp;
558 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000559 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000560 *tp++ = c = tok_nextc(tok);
561 } while (c != EOF && c != '\n' &&
562 tp - cbuf + 1 < sizeof(cbuf));
563 *tp = '\0';
564 for (cp = tabforms;
565 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
566 cp++) {
567 if ((tp = strstr(cbuf, *cp))) {
568 int newsize = atoi(tp + strlen(*cp));
569
570 if (newsize >= 1 && newsize <= 40) {
571 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000572 if (Py_VerboseFlag)
573 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000574 "Tab size set to %d\n",
575 newsize);
576 }
577 }
578 }
579 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000580 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000581 }
582
583 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000584 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000585 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000586 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000587
588 /* Identifier (most frequent token!) */
589 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000590 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000591 switch (c) {
592 case 'r':
593 case 'R':
594 c = tok_nextc(tok);
595 if (c == '"' || c == '\'')
596 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000597 break;
598 case 'u':
599 case 'U':
600 c = tok_nextc(tok);
601 if (c == 'r' || c == 'R')
602 c = tok_nextc(tok);
603 if (c == '"' || c == '\'')
604 goto letter_quote;
605 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000606 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000607 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000609 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000610 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000611 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000612 *p_end = tok->cur;
613 return NAME;
614 }
615
616 /* Newline */
617 if (c == '\n') {
618 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000619 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000620 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000621 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
623 return NEWLINE;
624 }
625
Guido van Rossum2d45be11997-04-11 19:16:25 +0000626#ifdef macintosh
627 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000628 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000629 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000630 tok->done = E_TOKEN;
631 tok->cur = tok->inp;
632 return ERRORTOKEN;
633 }
634#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000635 /* Period or number starting with period? */
636 if (c == '.') {
637 c = tok_nextc(tok);
638 if (isdigit(c)) {
639 goto fraction;
640 }
641 else {
642 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000643 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000644 *p_end = tok->cur;
645 return DOT;
646 }
647 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000648
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649 /* Number */
650 if (isdigit(c)) {
651 if (c == '0') {
652 /* Hex or octal */
653 c = tok_nextc(tok);
654 if (c == '.')
655 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000656#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000657 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000658 goto imaginary;
659#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660 if (c == 'x' || c == 'X') {
661 /* Hex */
662 do {
663 c = tok_nextc(tok);
664 } while (isxdigit(c));
665 }
666 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000667 /* XXX This is broken! E.g.,
668 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000669 /* Octal; c is first char of it */
670 /* There's no 'isoctdigit' macro, sigh */
671 while ('0' <= c && c < '8') {
672 c = tok_nextc(tok);
673 }
674 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000675 if (c == 'l' || c == 'L')
676 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677 }
678 else {
679 /* Decimal */
680 do {
681 c = tok_nextc(tok);
682 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000683 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000685 else {
686 /* Accept floating point numbers.
687 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000688 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000689 if (c == '.') {
690 fraction:
691 /* Fraction */
692 do {
693 c = tok_nextc(tok);
694 } while (isdigit(c));
695 }
696 if (c == 'e' || c == 'E') {
697 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000698 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000699 if (c == '+' || c == '-')
700 c = tok_nextc(tok);
701 while (isdigit(c)) {
702 c = tok_nextc(tok);
703 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000704 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000705#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000706 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000707 /* Imaginary part */
708 imaginary:
709 c = tok_nextc(tok);
710#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711 }
712 }
713 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000714 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715 *p_end = tok->cur;
716 return NUMBER;
717 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000718
719 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000720 /* String */
721 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000722 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000723 int quote = c;
724 int triple = 0;
725 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726 for (;;) {
727 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000728 if (c == '\n') {
729 if (!triple) {
730 tok->done = E_TOKEN;
731 tok_backup(tok, c);
732 return ERRORTOKEN;
733 }
734 tripcount = 0;
735 }
736 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000738 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 return ERRORTOKEN;
740 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000741 else if (c == quote) {
742 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000743 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000744 c = tok_nextc(tok);
745 if (c == quote) {
746 triple = 1;
747 tripcount = 0;
748 continue;
749 }
750 tok_backup(tok, c);
751 }
752 if (!triple || tripcount == 3)
753 break;
754 }
755 else if (c == '\\') {
756 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000758 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000760 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 return ERRORTOKEN;
762 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000764 else
765 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000767 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000768 *p_end = tok->cur;
769 return STRING;
770 }
771
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772 /* Line continuation */
773 if (c == '\\') {
774 c = tok_nextc(tok);
775 if (c != '\n') {
776 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000777 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 return ERRORTOKEN;
779 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780 goto again; /* Read next line */
781 }
782
Guido van Rossumfbab9051991-10-20 20:25:03 +0000783 /* Check for two-character token */
784 {
785 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000786 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000787 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000788 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000789 *p_end = tok->cur;
790 return token;
791 }
792 tok_backup(tok, c2);
793 }
794
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000795 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000796 switch (c) {
797 case '(':
798 case '[':
799 case '{':
800 tok->level++;
801 break;
802 case ')':
803 case ']':
804 case '}':
805 tok->level--;
806 break;
807 }
808
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000810 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000812 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813}
814
815
Guido van Rossum408027e1996-12-30 16:17:54 +0000816#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817
818void
819tok_dump(type, start, end)
820 int type;
821 char *start, *end;
822{
Guido van Rossum86bea461997-04-29 21:03:06 +0000823 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824 if (type == NAME || type == NUMBER || type == STRING || type == OP)
825 printf("(%.*s)", (int)(end - start), start);
826}
827
828#endif