/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI or Corporation for National Research Initiatives or
CNRI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

While CWI is the initial source for this software, a modified version
is made available by the Corporation for National Research Initiatives
(CNRI) at the Internet address ftp://ftp.python.org.

STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

******************************************************************/

Guido van Rossum85a5fbb1990-10-14 12:07:46 +000032/* Tokenizer implementation */
33
Guido van Rossum3f5da241990-12-20 15:06:42 +000034#include "pgenheaders.h"
35
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038#include "tokenizer.h"
39#include "errcode.h"
40
Guido van Rossum86bea461997-04-29 21:03:06 +000041extern char *PyOS_Readline Py_PROTO((char *));
Guido van Rossumf4b1a641994-08-29 12:43:07 +000042/* Return malloc'ed string including trailing \n;
43 empty malloc'ed string for EOF;
44 NULL if interrupted */
45
Guido van Rossum4fe87291992-02-26 15:24:44 +000046/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000047#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000049/* Convert a possibly signed character to a nonnegative int */
50/* XXX This assumes characters are 8 bits wide */
51#ifdef __CHAR_UNSIGNED__
52#define Py_CHARMASK(c) (c)
53#else
54#define Py_CHARMASK(c) ((c) & 0xff)
55#endif
56
Guido van Rossum3f5da241990-12-20 15:06:42 +000057/* Forward */
Guido van Rossum86bea461997-04-29 21:03:06 +000058static struct tok_state *tok_new Py_PROTO((void));
59static int tok_nextc Py_PROTO((struct tok_state *tok));
60static void tok_backup Py_PROTO((struct tok_state *tok, int c));
Guido van Rossum3f5da241990-12-20 15:06:42 +000061
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	/*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
	/*  4 */ "NEWLINE", "INDENT", "DEDENT",
	/*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
	/* 11 */ "COLON", "COMMA", "SEMI",
	/* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
	/* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
	/* 22 */ "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	/* 26 */ "LBRACE", "RBRACE",
	/* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	/* 36 */ "DOUBLESTAR",
	/* 37 */ "OP",
	/* 38 */ "<ERRORTOKEN>",
	/* 39 */ "<N_TOKENS>"
};
107
108
109/* Create and initialize a new tok_state structure */
110
111static struct tok_state *
112tok_new()
113{
Guido van Rossum86bea461997-04-29 21:03:06 +0000114 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000115 if (tok == NULL)
116 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000117 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000118 tok->done = E_OK;
119 tok->fp = NULL;
120 tok->tabsize = TABSIZE;
121 tok->indent = 0;
122 tok->indstack[0] = 0;
123 tok->atbol = 1;
124 tok->pendin = 0;
125 tok->prompt = tok->nextprompt = NULL;
126 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000127 tok->level = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 return tok;
129}
130
131
132/* Set up tokenizer for string */
133
134struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000135PyTokenizer_FromString(str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 char *str;
137{
138 struct tok_state *tok = tok_new();
139 if (tok == NULL)
140 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000141 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000142 return tok;
143}
144
145
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000146/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147
148struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000149PyTokenizer_FromFile(fp, ps1, ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000150 FILE *fp;
151 char *ps1, *ps2;
152{
153 struct tok_state *tok = tok_new();
154 if (tok == NULL)
155 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000156 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
157 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000158 return NULL;
159 }
160 tok->cur = tok->inp = tok->buf;
161 tok->end = tok->buf + BUFSIZ;
162 tok->fp = fp;
163 tok->prompt = ps1;
164 tok->nextprompt = ps2;
165 return tok;
166}
167
168
169/* Free a tok_state structure */
170
171void
Guido van Rossum86bea461997-04-29 21:03:06 +0000172PyTokenizer_Free(tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000173 struct tok_state *tok;
174{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000175 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000176 PyMem_DEL(tok->buf);
177 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000178}
179
180
181/* Get next char, updating state; error code goes into tok->done */
182
183static int
184tok_nextc(tok)
185 register struct tok_state *tok;
186{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000187 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000188 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000189 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000190 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000191 if (tok->done != E_OK)
192 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000194 char *end = strchr(tok->inp, '\n');
195 if (end != NULL)
196 end++;
197 else {
198 end = strchr(tok->inp, '\0');
199 if (end == tok->inp) {
200 tok->done = E_EOF;
201 return EOF;
202 }
203 }
204 if (tok->start == NULL)
205 tok->buf = tok->cur;
206 tok->lineno++;
207 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000208 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000209 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000210 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000211 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000212 if (tok->nextprompt != NULL)
213 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000214 if (new == NULL)
215 tok->done = E_INTR;
216 else if (*new == '\0') {
217 free(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000218 tok->done = E_EOF;
219 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000220 else if (tok->start != NULL) {
221 int start = tok->start - tok->buf;
222 int oldlen = tok->cur - tok->buf;
223 int newlen = oldlen + strlen(new);
224 char *buf = realloc(tok->buf, newlen+1);
225 tok->lineno++;
226 if (buf == NULL) {
227 free(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000228 tok->buf = NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000229 free(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000230 tok->done = E_NOMEM;
231 return EOF;
232 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000233 tok->buf = buf;
234 tok->cur = tok->buf + oldlen;
235 strcpy(tok->buf + oldlen, new);
236 free(new);
237 tok->inp = tok->buf + newlen;
238 tok->end = tok->inp + 1;
239 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000240 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000241 else {
242 tok->lineno++;
243 if (tok->buf != NULL)
244 free(tok->buf);
245 tok->buf = new;
246 tok->cur = tok->buf;
247 tok->inp = strchr(tok->buf, '\0');
248 tok->end = tok->inp + 1;
249 }
250 }
251 else {
252 int done = 0;
253 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000254 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000255 if (tok->start == NULL) {
256 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000257 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000258 if (tok->buf == NULL) {
259 tok->done = E_NOMEM;
260 return EOF;
261 }
262 tok->end = tok->buf + BUFSIZ;
263 }
264 if (fgets(tok->buf, (int)(tok->end - tok->buf),
265 tok->fp) == NULL) {
266 tok->done = E_EOF;
267 done = 1;
268 }
269 else {
270 tok->done = E_OK;
271 tok->inp = strchr(tok->buf, '\0');
272 done = tok->inp[-1] == '\n';
273 }
274 }
275 else {
276 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000277 if (feof(tok->fp)) {
278 tok->done = E_EOF;
279 done = 1;
280 }
281 else
282 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000283 }
284 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000285 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000286 while (!done) {
287 int curstart = tok->start == NULL ? -1 :
288 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000289 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000290 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000291 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000292 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000293 if (newbuf == NULL) {
294 tok->done = E_NOMEM;
295 tok->cur = tok->inp;
296 return EOF;
297 }
298 tok->buf = newbuf;
299 tok->inp = tok->buf + curvalid;
300 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000301 tok->start = curstart < 0 ? NULL :
302 tok->buf + curstart;
303 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000304 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000305 tok->fp) == NULL) {
306 /* Last line does not end in \n,
307 fake one */
308 strcpy(tok->inp, "\n");
309 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000310 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000311 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000312 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000313 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000314#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000315 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000316 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000317 pt = tok->inp - 2;
318 if (pt >= tok->buf && *pt == '\r') {
319 *pt++ = '\n';
320 *pt = '\0';
321 tok->inp = pt;
322 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000323#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000324 }
325 if (tok->done != E_OK) {
326 if (tok->prompt != NULL)
327 fprintf(stderr, "\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000328 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000329 return EOF;
330 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000331 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000332 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000333}
334
335
336/* Back-up one character */
337
338static void
339tok_backup(tok, c)
340 register struct tok_state *tok;
341 register int c;
342{
343 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000344 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000345 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000346 if (*tok->cur != c)
347 *tok->cur = c;
348 }
349}
350
351
352/* Return the token corresponding to a single character */
353
354int
Guido van Rossum86bea461997-04-29 21:03:06 +0000355PyToken_OneChar(c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000356 int c;
357{
358 switch (c) {
359 case '(': return LPAR;
360 case ')': return RPAR;
361 case '[': return LSQB;
362 case ']': return RSQB;
363 case ':': return COLON;
364 case ',': return COMMA;
365 case ';': return SEMI;
366 case '+': return PLUS;
367 case '-': return MINUS;
368 case '*': return STAR;
369 case '/': return SLASH;
370 case '|': return VBAR;
371 case '&': return AMPER;
372 case '<': return LESS;
373 case '>': return GREATER;
374 case '=': return EQUAL;
375 case '.': return DOT;
376 case '%': return PERCENT;
377 case '`': return BACKQUOTE;
378 case '{': return LBRACE;
379 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000380 case '^': return CIRCUMFLEX;
381 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000382 default: return OP;
383 }
384}
385
386
Guido van Rossumfbab9051991-10-20 20:25:03 +0000387int
Guido van Rossum86bea461997-04-29 21:03:06 +0000388PyToken_TwoChars(c1, c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000389 int c1, c2;
390{
391 switch (c1) {
392 case '=':
393 switch (c2) {
394 case '=': return EQEQUAL;
395 }
396 break;
397 case '!':
398 switch (c2) {
399 case '=': return NOTEQUAL;
400 }
401 break;
402 case '<':
403 switch (c2) {
404 case '>': return NOTEQUAL;
405 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000406 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000407 }
408 break;
409 case '>':
410 switch (c2) {
411 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000412 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000413 }
414 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000415 case '*':
416 switch (c2) {
417 case '*': return DOUBLESTAR;
418 }
419 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000420 }
421 return OP;
422}
423
424
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000425/* Get next token, after space stripping etc. */
426
427int
Guido van Rossum86bea461997-04-29 21:03:06 +0000428PyTokenizer_Get(tok, p_start, p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000429 register struct tok_state *tok; /* In/out: tokenizer state */
430 char **p_start, **p_end; /* Out: point to start/end of token */
431{
432 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000433 int blankline;
434
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000435 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000436 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000437 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000438 blankline = 0;
439
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000440 /* Get indentation level */
441 if (tok->atbol) {
442 register int col = 0;
443 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000444 for (;;) {
445 c = tok_nextc(tok);
446 if (c == ' ')
447 col++;
448 else if (c == '\t')
449 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum94d32b11995-07-07 22:27:27 +0000450 else if (c == '\014') /* Control-L (formfeed) */
451 col = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000452 else
453 break;
454 }
455 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000456 if (c == '#' || c == '\n') {
457 /* Lines with only whitespace and/or comments
458 shouldn't affect the indentation and are
459 not passed to the parser as NEWLINE tokens,
460 except *totally* empty lines in interactive
461 mode, which signal the end of a command group. */
462 if (col == 0 && c == '\n' && tok->prompt != NULL)
463 blankline = 0; /* Let it through */
464 else
465 blankline = 1; /* Ignore completely */
466 /* We can't jump back right here since we still
467 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000468 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000469 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000470 if (col == tok->indstack[tok->indent]) {
471 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000472 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000473 else if (col > tok->indstack[tok->indent]) {
474 /* Indent -- always one */
475 if (tok->indent+1 >= MAXINDENT) {
476 fprintf(stderr, "excessive indent\n");
477 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000478 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000479 return ERRORTOKEN;
480 }
481 tok->pendin++;
482 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000483 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000484 else /* col < tok->indstack[tok->indent] */ {
485 /* Dedent -- any number, must be consistent */
486 while (tok->indent > 0 &&
487 col < tok->indstack[tok->indent]) {
488 tok->indent--;
489 tok->pendin--;
490 }
491 if (col != tok->indstack[tok->indent]) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000492 fprintf(stderr,
493 "inconsistent dedent\n");
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000494 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000495 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000496 return ERRORTOKEN;
497 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000498 }
499 }
500 }
501
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000502 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000503
504 /* Return pending indents/dedents */
505 if (tok->pendin != 0) {
506 if (tok->pendin < 0) {
507 tok->pendin++;
508 return DEDENT;
509 }
510 else {
511 tok->pendin--;
512 return INDENT;
513 }
514 }
515
516 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000517 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000518 /* Skip spaces */
519 do {
520 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000521 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000522
523 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000524 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000525
526 /* Skip comment */
527 if (c == '#') {
528 /* Hack to allow overriding the tabsize in the file.
529 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000530 beginning or end of the file. (Will vi never die...?)
531 For Python it must be at the beginning of the file! */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000532 /* XXX The real vi syntax is actually different :-( */
533 /* XXX Should recognize Emacs syntax, too */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000534 int x;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000535 if (sscanf(tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000536 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000537 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000538 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000539 tok->tabsize = x;
540 }
541 do {
542 c = tok_nextc(tok);
543 } while (c != EOF && c != '\n');
544 }
545
546 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000547 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000548 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000549 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000550
551 /* Identifier (most frequent token!) */
552 if (isalpha(c) || c == '_') {
Guido van Rossum5026cb41997-04-25 17:32:00 +0000553 switch (c) {
554 case 'r':
555 case 'R':
556 c = tok_nextc(tok);
557 if (c == '"' || c == '\'')
558 goto letter_quote;
559 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000560 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000561 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000562 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000563 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000564 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000565 *p_end = tok->cur;
566 return NAME;
567 }
568
569 /* Newline */
570 if (c == '\n') {
571 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000572 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000573 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000574 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000575 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
576 return NEWLINE;
577 }
578
Guido van Rossum2d45be11997-04-11 19:16:25 +0000579#ifdef macintosh
580 if (c == '\r') {
Guido van Rossum86bea461997-04-29 21:03:06 +0000581 fprintf(stderr,
582 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000583 tok->done = E_TOKEN;
584 tok->cur = tok->inp;
585 return ERRORTOKEN;
586 }
587#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000588 /* Period or number starting with period? */
589 if (c == '.') {
590 c = tok_nextc(tok);
591 if (isdigit(c)) {
592 goto fraction;
593 }
594 else {
595 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000596 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000597 *p_end = tok->cur;
598 return DOT;
599 }
600 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000601
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000602 /* Number */
603 if (isdigit(c)) {
604 if (c == '0') {
605 /* Hex or octal */
606 c = tok_nextc(tok);
607 if (c == '.')
608 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000609#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000610 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000611 goto imaginary;
612#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000613 if (c == 'x' || c == 'X') {
614 /* Hex */
615 do {
616 c = tok_nextc(tok);
617 } while (isxdigit(c));
618 }
619 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000620 /* XXX This is broken! E.g.,
621 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622 /* Octal; c is first char of it */
623 /* There's no 'isoctdigit' macro, sigh */
624 while ('0' <= c && c < '8') {
625 c = tok_nextc(tok);
626 }
627 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000628 if (c == 'l' || c == 'L')
629 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630 }
631 else {
632 /* Decimal */
633 do {
634 c = tok_nextc(tok);
635 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000636 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000637 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000638 else {
639 /* Accept floating point numbers.
640 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000641 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000642 if (c == '.') {
643 fraction:
644 /* Fraction */
645 do {
646 c = tok_nextc(tok);
647 } while (isdigit(c));
648 }
649 if (c == 'e' || c == 'E') {
650 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000651 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000652 if (c == '+' || c == '-')
653 c = tok_nextc(tok);
654 while (isdigit(c)) {
655 c = tok_nextc(tok);
656 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000658#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000659 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000660 /* Imaginary part */
661 imaginary:
662 c = tok_nextc(tok);
663#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000664 }
665 }
666 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000667 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000668 *p_end = tok->cur;
669 return NUMBER;
670 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000671
672 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000673 /* String */
674 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +0000675 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000676 int quote = c;
677 int triple = 0;
678 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000679 for (;;) {
680 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000681 if (c == '\n') {
682 if (!triple) {
683 tok->done = E_TOKEN;
684 tok_backup(tok, c);
685 return ERRORTOKEN;
686 }
687 tripcount = 0;
688 }
689 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000690 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000691 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000692 return ERRORTOKEN;
693 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000694 else if (c == quote) {
695 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +0000696 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000697 c = tok_nextc(tok);
698 if (c == quote) {
699 triple = 1;
700 tripcount = 0;
701 continue;
702 }
703 tok_backup(tok, c);
704 }
705 if (!triple || tripcount == 3)
706 break;
707 }
708 else if (c == '\\') {
709 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000710 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000711 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000713 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000714 return ERRORTOKEN;
715 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000716 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000717 else
718 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000719 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000720 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000721 *p_end = tok->cur;
722 return STRING;
723 }
724
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000725 /* Line continuation */
726 if (c == '\\') {
727 c = tok_nextc(tok);
728 if (c != '\n') {
729 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000730 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731 return ERRORTOKEN;
732 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000733 goto again; /* Read next line */
734 }
735
Guido van Rossumfbab9051991-10-20 20:25:03 +0000736 /* Check for two-character token */
737 {
738 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000739 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000740 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000741 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000742 *p_end = tok->cur;
743 return token;
744 }
745 tok_backup(tok, c2);
746 }
747
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000748 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000749 switch (c) {
750 case '(':
751 case '[':
752 case '{':
753 tok->level++;
754 break;
755 case ')':
756 case ']':
757 case '}':
758 tok->level--;
759 break;
760 }
761
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000763 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000765 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766}
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type on stdout, followed
   by the token text in parentheses for NAME, NUMBER, STRING and OP
   tokens.  start and end delimit the token text. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", _PyParser_TokenNames[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	}
}

#endif