blob: 3dc6c827999d62e1a33fcac49f43eb0be17e0b26 [file] [log] [blame]
/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI or Corporation for National Research Initiatives or
CNRI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

While CWI is the initial source for this software, a modified version
is made available by the Corporation for National Research Initiatives
(CNRI) at the Internet address ftp://ftp.python.org.

STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
31
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000032/* Tokenizer implementation */
33
Guido van Rossum3f5da241990-12-20 15:06:42 +000034#include "pgenheaders.h"
35
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038#include "tokenizer.h"
39#include "errcode.h"
40
/* Line reader used by tok_nextc() when a prompt is set; the argument
   is the prompt string.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline Py_PROTO((char *));

/* Default number of columns per tab stop when computing indentation.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of the file-local helpers defined below */
static struct tok_state *tok_new Py_PROTO((void));
static int tok_nextc Py_PROTO((struct tok_state *tok));
static void tok_backup Py_PROTO((struct tok_state *tok, int c));
Guido van Rossum3f5da241990-12-20 15:06:42 +000053
/* Token names.
   Printable name of every token type; tok_dump() indexes this table
   with the token type.  The numbers in the comments are array
   positions.  This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	/*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
	/*  4 */ "NEWLINE", "INDENT", "DEDENT",
	/*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
	/* 11 */ "COLON", "COMMA", "SEMI",
	/* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
	/* 18 */ "VBAR", "AMPER",
	/* 20 */ "LESS", "GREATER", "EQUAL",
	/* 23 */ "DOT", "PERCENT", "BACKQUOTE",
	/* 26 */ "LBRACE", "RBRACE",
	/* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	/* 36 */ "DOUBLESTAR",
	/* 37 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
99
100
101/* Create and initialize a new tok_state structure */
102
103static struct tok_state *
104tok_new()
105{
Guido van Rossum86bea461997-04-29 21:03:06 +0000106 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107 if (tok == NULL)
108 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000109 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000110 tok->done = E_OK;
111 tok->fp = NULL;
112 tok->tabsize = TABSIZE;
113 tok->indent = 0;
114 tok->indstack[0] = 0;
115 tok->atbol = 1;
116 tok->pendin = 0;
117 tok->prompt = tok->nextprompt = NULL;
118 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000119 tok->level = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 return tok;
121}
122
123
124/* Set up tokenizer for string */
125
126struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000127PyTokenizer_FromString(str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 char *str;
129{
130 struct tok_state *tok = tok_new();
131 if (tok == NULL)
132 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000133 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000134 return tok;
135}
136
137
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000138/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000139
140struct tok_state *
Guido van Rossum86bea461997-04-29 21:03:06 +0000141PyTokenizer_FromFile(fp, ps1, ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000142 FILE *fp;
143 char *ps1, *ps2;
144{
145 struct tok_state *tok = tok_new();
146 if (tok == NULL)
147 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000148 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
149 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000150 return NULL;
151 }
152 tok->cur = tok->inp = tok->buf;
153 tok->end = tok->buf + BUFSIZ;
154 tok->fp = fp;
155 tok->prompt = ps1;
156 tok->nextprompt = ps2;
157 return tok;
158}
159
160
161/* Free a tok_state structure */
162
163void
Guido van Rossum86bea461997-04-29 21:03:06 +0000164PyTokenizer_Free(tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000165 struct tok_state *tok;
166{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000167 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000168 PyMem_DEL(tok->buf);
169 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000170}
171
172
173/* Get next char, updating state; error code goes into tok->done */
174
175static int
176tok_nextc(tok)
177 register struct tok_state *tok;
178{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000179 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000180 if (tok->cur != tok->inp) {
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000181 return *tok->cur++; /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000182 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000183 if (tok->done != E_OK)
184 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000185 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000186 char *end = strchr(tok->inp, '\n');
187 if (end != NULL)
188 end++;
189 else {
190 end = strchr(tok->inp, '\0');
191 if (end == tok->inp) {
192 tok->done = E_EOF;
193 return EOF;
194 }
195 }
196 if (tok->start == NULL)
197 tok->buf = tok->cur;
198 tok->lineno++;
199 tok->inp = end;
200 return *tok->cur++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000201 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000203 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000204 if (tok->nextprompt != NULL)
205 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000206 if (new == NULL)
207 tok->done = E_INTR;
208 else if (*new == '\0') {
209 free(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000210 tok->done = E_EOF;
211 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000212 else if (tok->start != NULL) {
213 int start = tok->start - tok->buf;
214 int oldlen = tok->cur - tok->buf;
215 int newlen = oldlen + strlen(new);
216 char *buf = realloc(tok->buf, newlen+1);
217 tok->lineno++;
218 if (buf == NULL) {
219 free(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000220 tok->buf = NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000221 free(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000222 tok->done = E_NOMEM;
223 return EOF;
224 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000225 tok->buf = buf;
226 tok->cur = tok->buf + oldlen;
227 strcpy(tok->buf + oldlen, new);
228 free(new);
229 tok->inp = tok->buf + newlen;
230 tok->end = tok->inp + 1;
231 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000232 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000233 else {
234 tok->lineno++;
235 if (tok->buf != NULL)
236 free(tok->buf);
237 tok->buf = new;
238 tok->cur = tok->buf;
239 tok->inp = strchr(tok->buf, '\0');
240 tok->end = tok->inp + 1;
241 }
242 }
243 else {
244 int done = 0;
245 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000246 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000247 if (tok->start == NULL) {
248 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000249 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000250 if (tok->buf == NULL) {
251 tok->done = E_NOMEM;
252 return EOF;
253 }
254 tok->end = tok->buf + BUFSIZ;
255 }
256 if (fgets(tok->buf, (int)(tok->end - tok->buf),
257 tok->fp) == NULL) {
258 tok->done = E_EOF;
259 done = 1;
260 }
261 else {
262 tok->done = E_OK;
263 tok->inp = strchr(tok->buf, '\0');
264 done = tok->inp[-1] == '\n';
265 }
266 }
267 else {
268 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000269 if (feof(tok->fp)) {
270 tok->done = E_EOF;
271 done = 1;
272 }
273 else
274 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000275 }
276 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000277 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000278 while (!done) {
279 int curstart = tok->start == NULL ? -1 :
280 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000281 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000282 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000283 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000284 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000285 if (newbuf == NULL) {
286 tok->done = E_NOMEM;
287 tok->cur = tok->inp;
288 return EOF;
289 }
290 tok->buf = newbuf;
291 tok->inp = tok->buf + curvalid;
292 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000293 tok->start = curstart < 0 ? NULL :
294 tok->buf + curstart;
295 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000296 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000297 tok->fp) == NULL) {
298 /* Last line does not end in \n,
299 fake one */
300 strcpy(tok->inp, "\n");
301 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000302 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000303 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000304 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000305 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000306#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000307 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000308 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000309 pt = tok->inp - 2;
310 if (pt >= tok->buf && *pt == '\r') {
311 *pt++ = '\n';
312 *pt = '\0';
313 tok->inp = pt;
314 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000315#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000316 }
317 if (tok->done != E_OK) {
318 if (tok->prompt != NULL)
319 fprintf(stderr, "\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000320 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000321 return EOF;
322 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000323 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000324 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000325}
326
327
328/* Back-up one character */
329
330static void
331tok_backup(tok, c)
332 register struct tok_state *tok;
333 register int c;
334{
335 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000336 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000337 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000338 if (*tok->cur != c)
339 *tok->cur = c;
340 }
341}
342
343
344/* Return the token corresponding to a single character */
345
346int
Guido van Rossum86bea461997-04-29 21:03:06 +0000347PyToken_OneChar(c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000348 int c;
349{
350 switch (c) {
351 case '(': return LPAR;
352 case ')': return RPAR;
353 case '[': return LSQB;
354 case ']': return RSQB;
355 case ':': return COLON;
356 case ',': return COMMA;
357 case ';': return SEMI;
358 case '+': return PLUS;
359 case '-': return MINUS;
360 case '*': return STAR;
361 case '/': return SLASH;
362 case '|': return VBAR;
363 case '&': return AMPER;
364 case '<': return LESS;
365 case '>': return GREATER;
366 case '=': return EQUAL;
367 case '.': return DOT;
368 case '%': return PERCENT;
369 case '`': return BACKQUOTE;
370 case '{': return LBRACE;
371 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000372 case '^': return CIRCUMFLEX;
373 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000374 default: return OP;
375 }
376}
377
378
Guido van Rossumfbab9051991-10-20 20:25:03 +0000379int
Guido van Rossum86bea461997-04-29 21:03:06 +0000380PyToken_TwoChars(c1, c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000381 int c1, c2;
382{
383 switch (c1) {
384 case '=':
385 switch (c2) {
386 case '=': return EQEQUAL;
387 }
388 break;
389 case '!':
390 switch (c2) {
391 case '=': return NOTEQUAL;
392 }
393 break;
394 case '<':
395 switch (c2) {
396 case '>': return NOTEQUAL;
397 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000398 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000399 }
400 break;
401 case '>':
402 switch (c2) {
403 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000404 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000405 }
406 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000407 case '*':
408 switch (c2) {
409 case '*': return DOUBLESTAR;
410 }
411 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000412 }
413 return OP;
414}
415
416
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000417/* Get next token, after space stripping etc. */
418
419int
Guido van Rossum86bea461997-04-29 21:03:06 +0000420PyTokenizer_Get(tok, p_start, p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000421 register struct tok_state *tok; /* In/out: tokenizer state */
422 char **p_start, **p_end; /* Out: point to start/end of token */
423{
424 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000425 int blankline;
426
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000427 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000428 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000429 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000430 blankline = 0;
431
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000432 /* Get indentation level */
433 if (tok->atbol) {
434 register int col = 0;
435 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000436 for (;;) {
437 c = tok_nextc(tok);
438 if (c == ' ')
439 col++;
440 else if (c == '\t')
441 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum94d32b11995-07-07 22:27:27 +0000442 else if (c == '\014') /* Control-L (formfeed) */
443 col = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000444 else
445 break;
446 }
447 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000448 if (c == '#' || c == '\n') {
449 /* Lines with only whitespace and/or comments
450 shouldn't affect the indentation and are
451 not passed to the parser as NEWLINE tokens,
452 except *totally* empty lines in interactive
453 mode, which signal the end of a command group. */
454 if (col == 0 && c == '\n' && tok->prompt != NULL)
455 blankline = 0; /* Let it through */
456 else
457 blankline = 1; /* Ignore completely */
458 /* We can't jump back right here since we still
459 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000460 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000461 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000462 if (col == tok->indstack[tok->indent]) {
463 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000464 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000465 else if (col > tok->indstack[tok->indent]) {
466 /* Indent -- always one */
467 if (tok->indent+1 >= MAXINDENT) {
468 fprintf(stderr, "excessive indent\n");
469 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000470 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000471 return ERRORTOKEN;
472 }
473 tok->pendin++;
474 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000475 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000476 else /* col < tok->indstack[tok->indent] */ {
477 /* Dedent -- any number, must be consistent */
478 while (tok->indent > 0 &&
479 col < tok->indstack[tok->indent]) {
480 tok->indent--;
481 tok->pendin--;
482 }
483 if (col != tok->indstack[tok->indent]) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000484 fprintf(stderr,
485 "inconsistent dedent\n");
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000486 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000487 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000488 return ERRORTOKEN;
489 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000490 }
491 }
492 }
493
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000494 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000495
496 /* Return pending indents/dedents */
497 if (tok->pendin != 0) {
498 if (tok->pendin < 0) {
499 tok->pendin++;
500 return DEDENT;
501 }
502 else {
503 tok->pendin--;
504 return INDENT;
505 }
506 }
507
508 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000509 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000510 /* Skip spaces */
511 do {
512 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +0000513 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000514
515 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000516 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000517
518 /* Skip comment */
519 if (c == '#') {
520 /* Hack to allow overriding the tabsize in the file.
521 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000522 beginning or end of the file. (Will vi never die...?)
523 For Python it must be at the beginning of the file! */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000524 /* XXX The real vi syntax is actually different :-( */
525 /* XXX Should recognize Emacs syntax, too */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000526 int x;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000527 if (sscanf(tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000528 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000529 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000530 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000531 tok->tabsize = x;
532 }
533 do {
534 c = tok_nextc(tok);
535 } while (c != EOF && c != '\n');
536 }
537
538 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000539 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000540 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000541 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000542
543 /* Identifier (most frequent token!) */
544 if (isalpha(c) || c == '_') {
Guido van Rossum5026cb41997-04-25 17:32:00 +0000545 switch (c) {
546 case 'r':
547 case 'R':
548 c = tok_nextc(tok);
549 if (c == '"' || c == '\'')
550 goto letter_quote;
551 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000552 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000553 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +0000554 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000555 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000556 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000557 *p_end = tok->cur;
558 return NAME;
559 }
560
561 /* Newline */
562 if (c == '\n') {
563 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000564 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000565 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000566 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000567 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
568 return NEWLINE;
569 }
570
Guido van Rossum2d45be11997-04-11 19:16:25 +0000571#ifdef macintosh
572 if (c == '\r') {
Guido van Rossum86bea461997-04-29 21:03:06 +0000573 fprintf(stderr,
574 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +0000575 tok->done = E_TOKEN;
576 tok->cur = tok->inp;
577 return ERRORTOKEN;
578 }
579#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000580 /* Period or number starting with period? */
581 if (c == '.') {
582 c = tok_nextc(tok);
583 if (isdigit(c)) {
584 goto fraction;
585 }
586 else {
587 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000588 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000589 *p_end = tok->cur;
590 return DOT;
591 }
592 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000593
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000594 /* Number */
595 if (isdigit(c)) {
596 if (c == '0') {
597 /* Hex or octal */
598 c = tok_nextc(tok);
599 if (c == '.')
600 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000601#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000602 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000603 goto imaginary;
604#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000605 if (c == 'x' || c == 'X') {
606 /* Hex */
607 do {
608 c = tok_nextc(tok);
609 } while (isxdigit(c));
610 }
611 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000612 /* XXX This is broken! E.g.,
613 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000614 /* Octal; c is first char of it */
615 /* There's no 'isoctdigit' macro, sigh */
616 while ('0' <= c && c < '8') {
617 c = tok_nextc(tok);
618 }
619 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000620 if (c == 'l' || c == 'L')
621 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622 }
623 else {
624 /* Decimal */
625 do {
626 c = tok_nextc(tok);
627 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000628 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000629 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000630 else {
631 /* Accept floating point numbers.
632 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000633 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000634 if (c == '.') {
635 fraction:
636 /* Fraction */
637 do {
638 c = tok_nextc(tok);
639 } while (isdigit(c));
640 }
641 if (c == 'e' || c == 'E') {
642 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000643 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000644 if (c == '+' || c == '-')
645 c = tok_nextc(tok);
646 while (isdigit(c)) {
647 c = tok_nextc(tok);
648 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649 }
Guido van Rossumf595fde1996-01-12 01:31:58 +0000650#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +0000651 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +0000652 /* Imaginary part */
653 imaginary:
654 c = tok_nextc(tok);
655#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 }
657 }
658 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000659 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660 *p_end = tok->cur;
661 return NUMBER;
662 }
Guido van Rossum24dacb31997-04-06 03:46:20 +0000663
664 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000665 /* String */
666 if (c == '\'' || c == '"') {
Guido van Rossum24dacb31997-04-06 03:46:20 +0000667 char *quote2 = tok->cur+1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000668 int quote = c;
669 int triple = 0;
670 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000671 for (;;) {
672 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000673 if (c == '\n') {
674 if (!triple) {
675 tok->done = E_TOKEN;
676 tok_backup(tok, c);
677 return ERRORTOKEN;
678 }
679 tripcount = 0;
680 }
681 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000683 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 return ERRORTOKEN;
685 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000686 else if (c == quote) {
687 tripcount++;
Guido van Rossum24dacb31997-04-06 03:46:20 +0000688 if (tok->cur == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000689 c = tok_nextc(tok);
690 if (c == quote) {
691 triple = 1;
692 tripcount = 0;
693 continue;
694 }
695 tok_backup(tok, c);
696 }
697 if (!triple || tripcount == 3)
698 break;
699 }
700 else if (c == '\\') {
701 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000703 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000704 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000705 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706 return ERRORTOKEN;
707 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000708 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000709 else
710 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000712 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000713 *p_end = tok->cur;
714 return STRING;
715 }
716
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717 /* Line continuation */
718 if (c == '\\') {
719 c = tok_nextc(tok);
720 if (c != '\n') {
721 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000722 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723 return ERRORTOKEN;
724 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000725 goto again; /* Read next line */
726 }
727
Guido van Rossumfbab9051991-10-20 20:25:03 +0000728 /* Check for two-character token */
729 {
730 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +0000731 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +0000732 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000733 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000734 *p_end = tok->cur;
735 return token;
736 }
737 tok_backup(tok, c2);
738 }
739
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000740 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000741 switch (c) {
742 case '(':
743 case '[':
744 case '{':
745 tok->level++;
746 break;
747 case ')':
748 case ']':
749 case '}':
750 tok->level--;
751 break;
752 }
753
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000755 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +0000757 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758}
759
760
#ifdef Py_DEBUG

/* Debugging aid: print the name of token type 'type' on stdout.
   For NAME, NUMBER, STRING and OP tokens the token text, i.e. the
   characters from start up to (not including) end, follows in
   parentheses.  No newline is printed. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", _PyParser_TokenNames[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		break;
	}
}

#endif