blob: b4e0fbf7e58d9446e02d3f5af619b42b0191f2fa [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00008
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009#include "tokenizer.h"
10#include "errcode.h"
11
Tim Petersdbd9ba62000-07-09 03:09:57 +000012extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000013/* Return malloc'ed string including trailing \n;
14 empty malloc'ed string for EOF;
15 NULL if interrupted */
16
Guido van Rossum4fe87291992-02-26 15:24:44 +000017/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000018#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000019
/* Py_CHARMASK(c): yield the value of a char as a nonnegative int, whether
   or not plain char is a signed type on this platform.
   XXX This relies on characters being exactly 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c) (c)
#else
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif
27
Guido van Rossum3f5da241990-12-20 15:06:42 +000028/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000029static struct tok_state *tok_new(void);
30static int tok_nextc(struct tok_state *tok);
31static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000032
/* Printable token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Markers, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character brackets and separators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* Generic operator and the two pseudo-entries that close the table */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
91
92
93/* Create and initialize a new tok_state structure */
94
95static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000096tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000097{
Guido van Rossum86bea461997-04-29 21:03:06 +000098 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099 if (tok == NULL)
100 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000101 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000102 tok->done = E_OK;
103 tok->fp = NULL;
104 tok->tabsize = TABSIZE;
105 tok->indent = 0;
106 tok->indstack[0] = 0;
107 tok->atbol = 1;
108 tok->pendin = 0;
109 tok->prompt = tok->nextprompt = NULL;
110 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000111 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000112 tok->filename = NULL;
113 tok->altwarning = 0;
114 tok->alterror = 0;
115 tok->alttabsize = 1;
116 tok->altindstack[0] = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 return tok;
118}
119
120
121/* Set up tokenizer for string */
122
123struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000124PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125{
126 struct tok_state *tok = tok_new();
127 if (tok == NULL)
128 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000129 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
133
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000134/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135
136struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000137PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138{
139 struct tok_state *tok = tok_new();
140 if (tok == NULL)
141 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000142 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
143 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000144 return NULL;
145 }
146 tok->cur = tok->inp = tok->buf;
147 tok->end = tok->buf + BUFSIZ;
148 tok->fp = fp;
149 tok->prompt = ps1;
150 tok->nextprompt = ps2;
151 return tok;
152}
153
154
155/* Free a tok_state structure */
156
157void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000158PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000160 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000161 PyMem_DEL(tok->buf);
162 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000163}
164
165
166/* Get next char, updating state; error code goes into tok->done */
167
168static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000169tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000170{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000171 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000172 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000173 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000174 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000175 if (tok->done != E_OK)
176 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
186 }
187 }
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000192 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000194 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000195 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000201 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 tok->done = E_EOF;
203 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000205 size_t start = tok->start - tok->buf;
206 size_t oldlen = tok->cur - tok->buf;
207 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000208 char *buf = tok->buf;
209 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000210 tok->lineno++;
211 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000212 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000213 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000214 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000215 tok->done = E_NOMEM;
216 return EOF;
217 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000218 tok->buf = buf;
219 tok->cur = tok->buf + oldlen;
220 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000221 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000222 tok->inp = tok->buf + newlen;
223 tok->end = tok->inp + 1;
224 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000225 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000226 else {
227 tok->lineno++;
228 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000230 tok->buf = new;
231 tok->cur = tok->buf;
232 tok->inp = strchr(tok->buf, '\0');
233 tok->end = tok->inp + 1;
234 }
235 }
236 else {
237 int done = 0;
238 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000239 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000240 if (tok->start == NULL) {
241 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000242 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000243 if (tok->buf == NULL) {
244 tok->done = E_NOMEM;
245 return EOF;
246 }
247 tok->end = tok->buf + BUFSIZ;
248 }
Jack Jansen7b8c7542002-04-14 20:12:41 +0000249 if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
250 tok->fp, NULL) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000251 tok->done = E_EOF;
252 done = 1;
253 }
254 else {
255 tok->done = E_OK;
256 tok->inp = strchr(tok->buf, '\0');
257 done = tok->inp[-1] == '\n';
258 }
259 }
260 else {
261 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000262 if (feof(tok->fp)) {
263 tok->done = E_EOF;
264 done = 1;
265 }
266 else
267 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000268 }
269 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000270 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000271 while (!done) {
272 int curstart = tok->start == NULL ? -1 :
273 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000274 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000275 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000276 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000277 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000278 if (newbuf == NULL) {
279 tok->done = E_NOMEM;
280 tok->cur = tok->inp;
281 return EOF;
282 }
283 tok->buf = newbuf;
284 tok->inp = tok->buf + curvalid;
285 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000286 tok->start = curstart < 0 ? NULL :
287 tok->buf + curstart;
Jack Jansen7b8c7542002-04-14 20:12:41 +0000288 if (Py_UniversalNewlineFgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000289 (int)(tok->end - tok->inp),
Jack Jansen7b8c7542002-04-14 20:12:41 +0000290 tok->fp, NULL) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000291 /* Last line does not end in \n,
292 fake one */
293 strcpy(tok->inp, "\n");
294 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000295 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000296 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000297 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000298 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000299#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000300 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000301 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000302 pt = tok->inp - 2;
303 if (pt >= tok->buf && *pt == '\r') {
304 *pt++ = '\n';
305 *pt = '\0';
306 tok->inp = pt;
307 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000308#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000309 }
310 if (tok->done != E_OK) {
311 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000312 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000313 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000314 return EOF;
315 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000316 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000317 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000318}
319
320
321/* Back-up one character */
322
323static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000324tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000325{
326 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000327 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000328 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000329 if (*tok->cur != c)
330 *tok->cur = c;
331 }
332}
333
334
335/* Return the token corresponding to a single character */
336
337int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000338PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000339{
340 switch (c) {
341 case '(': return LPAR;
342 case ')': return RPAR;
343 case '[': return LSQB;
344 case ']': return RSQB;
345 case ':': return COLON;
346 case ',': return COMMA;
347 case ';': return SEMI;
348 case '+': return PLUS;
349 case '-': return MINUS;
350 case '*': return STAR;
351 case '/': return SLASH;
352 case '|': return VBAR;
353 case '&': return AMPER;
354 case '<': return LESS;
355 case '>': return GREATER;
356 case '=': return EQUAL;
357 case '.': return DOT;
358 case '%': return PERCENT;
359 case '`': return BACKQUOTE;
360 case '{': return LBRACE;
361 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000362 case '^': return CIRCUMFLEX;
363 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000364 default: return OP;
365 }
366}
367
368
Guido van Rossumfbab9051991-10-20 20:25:03 +0000369int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000370PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000371{
372 switch (c1) {
373 case '=':
374 switch (c2) {
375 case '=': return EQEQUAL;
376 }
377 break;
378 case '!':
379 switch (c2) {
380 case '=': return NOTEQUAL;
381 }
382 break;
383 case '<':
384 switch (c2) {
385 case '>': return NOTEQUAL;
386 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000387 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000388 }
389 break;
390 case '>':
391 switch (c2) {
392 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000393 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000394 }
395 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000396 case '+':
397 switch (c2) {
398 case '=': return PLUSEQUAL;
399 }
400 break;
401 case '-':
402 switch (c2) {
403 case '=': return MINEQUAL;
404 }
405 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000406 case '*':
407 switch (c2) {
408 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000409 case '=': return STAREQUAL;
410 }
411 break;
412 case '/':
413 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000414 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000415 case '=': return SLASHEQUAL;
416 }
417 break;
418 case '|':
419 switch (c2) {
420 case '=': return VBAREQUAL;
421 }
422 break;
423 case '%':
424 switch (c2) {
425 case '=': return PERCENTEQUAL;
426 }
427 break;
428 case '&':
429 switch (c2) {
430 case '=': return AMPEREQUAL;
431 }
432 break;
433 case '^':
434 switch (c2) {
435 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000436 }
437 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000438 }
439 return OP;
440}
441
Thomas Wouters434d0822000-08-24 20:11:32 +0000442int
443PyToken_ThreeChars(int c1, int c2, int c3)
444{
445 switch (c1) {
446 case '<':
447 switch (c2) {
448 case '<':
449 switch (c3) {
450 case '=':
451 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000452 }
453 break;
454 }
455 break;
456 case '>':
457 switch (c2) {
458 case '>':
459 switch (c3) {
460 case '=':
461 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000462 }
463 break;
464 }
465 break;
466 case '*':
467 switch (c2) {
468 case '*':
469 switch (c3) {
470 case '=':
471 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000472 }
473 break;
474 }
475 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000476 case '/':
477 switch (c2) {
478 case '/':
479 switch (c3) {
480 case '=':
481 return DOUBLESLASHEQUAL;
482 }
483 break;
484 }
485 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000486 }
487 return OP;
488}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000489
Guido van Rossum926f13a1998-04-09 21:38:06 +0000490static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000491indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000492{
493 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000494 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000495 tok->cur = tok->inp;
496 return 1;
497 }
498 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000499 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
500 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000501 tok->altwarning = 0;
502 }
503 return 0;
504}
505
506
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000507/* Get next token, after space stripping etc. */
508
509int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000510PyTokenizer_Get(register struct tok_state *tok, char **p_start,
511 char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000512{
513 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000514 int blankline;
515
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000516 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000517 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000518 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000519 blankline = 0;
520
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000521 /* Get indentation level */
522 if (tok->atbol) {
523 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000524 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000525 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000526 for (;;) {
527 c = tok_nextc(tok);
528 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000529 col++, altcol++;
530 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000531 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000532 altcol = (altcol/tok->alttabsize + 1)
533 * tok->alttabsize;
534 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000535 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000536 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000537 else
538 break;
539 }
540 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000541 if (c == '#' || c == '\n') {
542 /* Lines with only whitespace and/or comments
543 shouldn't affect the indentation and are
544 not passed to the parser as NEWLINE tokens,
545 except *totally* empty lines in interactive
546 mode, which signal the end of a command group. */
547 if (col == 0 && c == '\n' && tok->prompt != NULL)
548 blankline = 0; /* Let it through */
549 else
550 blankline = 1; /* Ignore completely */
551 /* We can't jump back right here since we still
552 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000553 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000554 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000555 if (col == tok->indstack[tok->indent]) {
556 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000557 if (altcol != tok->altindstack[tok->indent]) {
558 if (indenterror(tok))
559 return ERRORTOKEN;
560 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000561 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000562 else if (col > tok->indstack[tok->indent]) {
563 /* Indent -- always one */
564 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000565 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000566 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000567 return ERRORTOKEN;
568 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000569 if (altcol <= tok->altindstack[tok->indent]) {
570 if (indenterror(tok))
571 return ERRORTOKEN;
572 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000573 tok->pendin++;
574 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000575 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000576 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000577 else /* col < tok->indstack[tok->indent] */ {
578 /* Dedent -- any number, must be consistent */
579 while (tok->indent > 0 &&
580 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000581 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +0000582 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000583 }
584 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +0000585 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000586 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000587 return ERRORTOKEN;
588 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000589 if (altcol != tok->altindstack[tok->indent]) {
590 if (indenterror(tok))
591 return ERRORTOKEN;
592 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000593 }
594 }
595 }
596
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000597 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000598
599 /* Return pending indents/dedents */
600 if (tok->pendin != 0) {
601 if (tok->pendin < 0) {
602 tok->pendin++;
603 return DEDENT;
604 }
605 else {
606 tok->pendin--;
607 return INDENT;
608 }
609 }
610
611 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000612 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000613 /* Skip spaces */
614 do {
615 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000616 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617
618 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000619 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000620
Guido van Rossumab5ca152000-03-31 00:52:27 +0000621 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000623 static char *tabforms[] = {
624 "tab-width:", /* Emacs */
625 ":tabstop=", /* vim, full form */
626 ":ts=", /* vim, abbreviated form */
627 "set tabsize=", /* will vi never die? */
628 /* more templates can be added here to support other editors */
629 };
630 char cbuf[80];
631 char *tp, **cp;
632 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000633 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +0000634 *tp++ = c = tok_nextc(tok);
635 } while (c != EOF && c != '\n' &&
636 tp - cbuf + 1 < sizeof(cbuf));
637 *tp = '\0';
638 for (cp = tabforms;
639 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
640 cp++) {
641 if ((tp = strstr(cbuf, *cp))) {
642 int newsize = atoi(tp + strlen(*cp));
643
644 if (newsize >= 1 && newsize <= 40) {
645 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +0000646 if (Py_VerboseFlag)
647 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +0000648 "Tab size set to %d\n",
649 newsize);
650 }
651 }
652 }
653 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000654 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655 }
656
657 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000658 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000660 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661
662 /* Identifier (most frequent token!) */
663 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +0000664 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +0000665 switch (c) {
666 case 'r':
667 case 'R':
668 c = tok_nextc(tok);
669 if (c == '"' || c == '\'')
670 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +0000671 break;
672 case 'u':
673 case 'U':
674 c = tok_nextc(tok);
675 if (c == 'r' || c == 'R')
676 c = tok_nextc(tok);
677 if (c == '"' || c == '\'')
678 goto letter_quote;
679 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +0000680 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000681 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000683 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000685 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686 *p_end = tok->cur;
687 return NAME;
688 }
689
690 /* Newline */
691 if (c == '\n') {
692 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000693 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000694 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000695 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000696 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
697 return NEWLINE;
698 }
699
Guido van Rossum2d45be11997-04-11 19:16:25 +0000700#ifdef macintosh
701 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000702 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +0000703 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000704 tok->done = E_TOKEN;
705 tok->cur = tok->inp;
706 return ERRORTOKEN;
707 }
708#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000709 /* Period or number starting with period? */
710 if (c == '.') {
711 c = tok_nextc(tok);
712 if (isdigit(c)) {
713 goto fraction;
714 }
715 else {
716 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000717 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000718 *p_end = tok->cur;
719 return DOT;
720 }
721 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000722
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723 /* Number */
724 if (isdigit(c)) {
725 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +0000726 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727 c = tok_nextc(tok);
728 if (c == '.')
729 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000730#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000731 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000732 goto imaginary;
733#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000734 if (c == 'x' || c == 'X') {
735 /* Hex */
736 do {
737 c = tok_nextc(tok);
738 } while (isxdigit(c));
739 }
740 else {
Tim Petersd507dab2001-08-30 20:51:59 +0000741 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 /* Octal; c is first char of it */
743 /* There's no 'isoctdigit' macro, sigh */
744 while ('0' <= c && c < '8') {
745 c = tok_nextc(tok);
746 }
Tim Petersd507dab2001-08-30 20:51:59 +0000747 if (isdigit(c)) {
748 found_decimal = 1;
749 do {
750 c = tok_nextc(tok);
751 } while (isdigit(c));
752 }
753 if (c == '.')
754 goto fraction;
755 else if (c == 'e' || c == 'E')
756 goto exponent;
757#ifndef WITHOUT_COMPLEX
758 else if (c == 'j' || c == 'J')
759 goto imaginary;
760#endif
761 else if (found_decimal) {
762 tok->done = E_TOKEN;
763 tok_backup(tok, c);
764 return ERRORTOKEN;
765 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000767 if (c == 'l' || c == 'L')
768 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769 }
770 else {
771 /* Decimal */
772 do {
773 c = tok_nextc(tok);
774 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000775 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000777 else {
Tim Peters9aa70d92001-08-27 19:19:28 +0000778 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +0000779 if (c == '.') {
780 fraction:
781 /* Fraction */
782 do {
783 c = tok_nextc(tok);
784 } while (isdigit(c));
785 }
786 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +0000787 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +0000788 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000790 if (c == '+' || c == '-')
791 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +0000792 if (!isdigit(c)) {
793 tok->done = E_TOKEN;
794 tok_backup(tok, c);
795 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +0000796 }
Tim Peters9aa70d92001-08-27 19:19:28 +0000797 do {
798 c = tok_nextc(tok);
799 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000801#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000802 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000803 /* Imaginary part */
804 imaginary:
805 c = tok_nextc(tok);
806#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807 }
808 }
809 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000810 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 *p_end = tok->cur;
812 return NUMBER;
813 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000814
815 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 /* String */
817 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000818 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 int quote = c;
820 int triple = 0;
821 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822 for (;;) {
823 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 if (c == '\n') {
825 if (!triple) {
826 tok->done = E_TOKEN;
827 tok_backup(tok, c);
828 return ERRORTOKEN;
829 }
830 tripcount = 0;
831 }
832 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000833 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000834 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835 return ERRORTOKEN;
836 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000837 else if (c == quote) {
838 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000839 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 c = tok_nextc(tok);
841 if (c == quote) {
842 triple = 1;
843 tripcount = 0;
844 continue;
845 }
846 tok_backup(tok, c);
847 }
848 if (!triple || tripcount == 3)
849 break;
850 }
851 else if (c == '\\') {
852 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000855 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000856 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857 return ERRORTOKEN;
858 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 else
861 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000863 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000864 *p_end = tok->cur;
865 return STRING;
866 }
867
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868 /* Line continuation */
869 if (c == '\\') {
870 c = tok_nextc(tok);
871 if (c != '\n') {
872 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000874 return ERRORTOKEN;
875 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000876 goto again; /* Read next line */
877 }
878
Guido van Rossumfbab9051991-10-20 20:25:03 +0000879 /* Check for two-character token */
880 {
881 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000882 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000883 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +0000884 int c3 = tok_nextc(tok);
885 int token3 = PyToken_ThreeChars(c, c2, c3);
886 if (token3 != OP) {
887 token = token3;
888 } else {
889 tok_backup(tok, c3);
890 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000891 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000892 *p_end = tok->cur;
893 return token;
894 }
895 tok_backup(tok, c2);
896 }
897
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000898 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000899 switch (c) {
900 case '(':
901 case '[':
902 case '{':
903 tok->level++;
904 break;
905 case ')':
906 case ']':
907 case '}':
908 tok->level--;
909 break;
910 }
911
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000913 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000915 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916}
917
918
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout and, for
   NAME, NUMBER, STRING and OP tokens, the text between start and end
   in parentheses. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif