blob: 083ef0e9a2e738fe59f666dfd761185c9d961126 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
Guido van Rossumb9f8d6e1995-01-04 19:08:09 +00002Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
Guido van Rossumf70e43a1991-02-19 12:39:46 +00004
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000025/* Tokenizer implementation */
26
Guido van Rossum3f5da241990-12-20 15:06:42 +000027#include "pgenheaders.h"
28
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000031#include "tokenizer.h"
32#include "errcode.h"
33
extern char *my_readline PROTO((char *));
/* Read one line of interactive input; the argument is the prompt string
   (tok_nextc() passes tok->prompt).
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted.
   The caller owns the returned string and releases it with free(). */

/* Default width of a tab stop when computing indentation columns.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of the file-local helpers defined below */
static struct tok_state *tok_new PROTO((void));
static int tok_nextc PROTO((struct tok_state *tok));
static void tok_backup PROTO((struct tok_state *tok, int c));

46
/* Token names */

/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
char *tok_name[] = {
	/* End of input, atoms and layout */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Comparison operators made of two characters */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Bitwise operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",

	/* Catch-all and bookkeeping entries */
	"OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
91
92
93/* Create and initialize a new tok_state structure */
94
95static struct tok_state *
96tok_new()
97{
98 struct tok_state *tok = NEW(struct tok_state, 1);
99 if (tok == NULL)
100 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000101 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000102 tok->done = E_OK;
103 tok->fp = NULL;
104 tok->tabsize = TABSIZE;
105 tok->indent = 0;
106 tok->indstack[0] = 0;
107 tok->atbol = 1;
108 tok->pendin = 0;
109 tok->prompt = tok->nextprompt = NULL;
110 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000111 tok->level = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
119tok_setups(str)
120 char *str;
121{
122 struct tok_state *tok = tok_new();
123 if (tok == NULL)
124 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000125 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000126 return tok;
127}
128
129
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000130/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131
132struct tok_state *
133tok_setupf(fp, ps1, ps2)
134 FILE *fp;
135 char *ps1, *ps2;
136{
137 struct tok_state *tok = tok_new();
138 if (tok == NULL)
139 return NULL;
140 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
141 DEL(tok);
142 return NULL;
143 }
144 tok->cur = tok->inp = tok->buf;
145 tok->end = tok->buf + BUFSIZ;
146 tok->fp = fp;
147 tok->prompt = ps1;
148 tok->nextprompt = ps2;
149 return tok;
150}
151
152
153/* Free a tok_state structure */
154
155void
156tok_free(tok)
157 struct tok_state *tok;
158{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159 if (tok->fp != NULL && tok->buf != NULL)
160 DEL(tok->buf);
161 DEL(tok);
162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
168tok_nextc(tok)
169 register struct tok_state *tok;
170{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000171 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000172 if (tok->cur != tok->inp) {
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000173 return *tok->cur++; /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000174 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000175 if (tok->done != E_OK)
176 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
186 }
187 }
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
192 return *tok->cur++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000194 if (tok->prompt != NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000195 char *new = my_readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
201 free(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 tok->done = E_EOF;
203 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 else if (tok->start != NULL) {
205 int start = tok->start - tok->buf;
206 int oldlen = tok->cur - tok->buf;
207 int newlen = oldlen + strlen(new);
208 char *buf = realloc(tok->buf, newlen+1);
209 tok->lineno++;
210 if (buf == NULL) {
211 free(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000212 tok->buf = NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000213 free(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000214 tok->done = E_NOMEM;
215 return EOF;
216 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000217 tok->buf = buf;
218 tok->cur = tok->buf + oldlen;
219 strcpy(tok->buf + oldlen, new);
220 free(new);
221 tok->inp = tok->buf + newlen;
222 tok->end = tok->inp + 1;
223 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000224 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000225 else {
226 tok->lineno++;
227 if (tok->buf != NULL)
228 free(tok->buf);
229 tok->buf = new;
230 tok->cur = tok->buf;
231 tok->inp = strchr(tok->buf, '\0');
232 tok->end = tok->inp + 1;
233 }
234 }
235 else {
236 int done = 0;
237 int cur = 0;
238 if (tok->start == NULL) {
239 if (tok->buf == NULL) {
240 tok->buf = NEW(char, BUFSIZ);
241 if (tok->buf == NULL) {
242 tok->done = E_NOMEM;
243 return EOF;
244 }
245 tok->end = tok->buf + BUFSIZ;
246 }
247 if (fgets(tok->buf, (int)(tok->end - tok->buf),
248 tok->fp) == NULL) {
249 tok->done = E_EOF;
250 done = 1;
251 }
252 else {
253 tok->done = E_OK;
254 tok->inp = strchr(tok->buf, '\0');
255 done = tok->inp[-1] == '\n';
256 }
257 }
258 else {
259 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000260 if (feof(tok->fp)) {
261 tok->done = E_EOF;
262 done = 1;
263 }
264 else
265 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000266 }
267 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000268 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000269 while (!done) {
270 int curstart = tok->start == NULL ? -1 :
271 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000272 int curvalid = tok->inp - tok->buf;
273 int cursize = tok->end - tok->buf;
274 int newsize = cursize + BUFSIZ;
275 char *newbuf = tok->buf;
276 RESIZE(newbuf, char, newsize);
277 if (newbuf == NULL) {
278 tok->done = E_NOMEM;
279 tok->cur = tok->inp;
280 return EOF;
281 }
282 tok->buf = newbuf;
283 tok->inp = tok->buf + curvalid;
284 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000285 tok->start = curstart < 0 ? NULL :
286 tok->buf + curstart;
287 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000288 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000289 tok->fp) == NULL) {
290 /* Last line does not end in \n,
291 fake one */
292 strcpy(tok->inp, "\n");
293 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000294 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000295 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000296 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000297 tok->cur = tok->buf + cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000298 }
299 if (tok->done != E_OK) {
300 if (tok->prompt != NULL)
301 fprintf(stderr, "\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000302 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000303 return EOF;
304 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000305 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000306 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000307}
308
309
310/* Back-up one character */
311
312static void
313tok_backup(tok, c)
314 register struct tok_state *tok;
315 register int c;
316{
317 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000318 if (--tok->cur < tok->buf)
319 fatal("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000320 if (*tok->cur != c)
321 *tok->cur = c;
322 }
323}
324
325
326/* Return the token corresponding to a single character */
327
328int
329tok_1char(c)
330 int c;
331{
332 switch (c) {
333 case '(': return LPAR;
334 case ')': return RPAR;
335 case '[': return LSQB;
336 case ']': return RSQB;
337 case ':': return COLON;
338 case ',': return COMMA;
339 case ';': return SEMI;
340 case '+': return PLUS;
341 case '-': return MINUS;
342 case '*': return STAR;
343 case '/': return SLASH;
344 case '|': return VBAR;
345 case '&': return AMPER;
346 case '<': return LESS;
347 case '>': return GREATER;
348 case '=': return EQUAL;
349 case '.': return DOT;
350 case '%': return PERCENT;
351 case '`': return BACKQUOTE;
352 case '{': return LBRACE;
353 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000354 case '^': return CIRCUMFLEX;
355 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000356 default: return OP;
357 }
358}
359
360
Guido van Rossumfbab9051991-10-20 20:25:03 +0000361int
362tok_2char(c1, c2)
363 int c1, c2;
364{
365 switch (c1) {
366 case '=':
367 switch (c2) {
368 case '=': return EQEQUAL;
369 }
370 break;
371 case '!':
372 switch (c2) {
373 case '=': return NOTEQUAL;
374 }
375 break;
376 case '<':
377 switch (c2) {
378 case '>': return NOTEQUAL;
379 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000380 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000381 }
382 break;
383 case '>':
384 switch (c2) {
385 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000386 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000387 }
388 break;
389 }
390 return OP;
391}
392
393
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000394/* Get next token, after space stripping etc. */
395
396int
397tok_get(tok, p_start, p_end)
398 register struct tok_state *tok; /* In/out: tokenizer state */
399 char **p_start, **p_end; /* Out: point to start/end of token */
400{
401 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000402 int blankline;
403
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000404 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000405 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000406 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000407 blankline = 0;
408
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000409 /* Get indentation level */
410 if (tok->atbol) {
411 register int col = 0;
412 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000413 for (;;) {
414 c = tok_nextc(tok);
415 if (c == ' ')
416 col++;
417 else if (c == '\t')
418 col = (col/tok->tabsize + 1) * tok->tabsize;
419 else
420 break;
421 }
422 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000423 if (c == '#' || c == '\n') {
424 /* Lines with only whitespace and/or comments
425 shouldn't affect the indentation and are
426 not passed to the parser as NEWLINE tokens,
427 except *totally* empty lines in interactive
428 mode, which signal the end of a command group. */
429 if (col == 0 && c == '\n' && tok->prompt != NULL)
430 blankline = 0; /* Let it through */
431 else
432 blankline = 1; /* Ignore completely */
433 /* We can't jump back right here since we still
434 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000435 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000436 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000437 if (col == tok->indstack[tok->indent]) {
438 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000439 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000440 else if (col > tok->indstack[tok->indent]) {
441 /* Indent -- always one */
442 if (tok->indent+1 >= MAXINDENT) {
443 fprintf(stderr, "excessive indent\n");
444 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000445 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000446 return ERRORTOKEN;
447 }
448 tok->pendin++;
449 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000450 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000451 else /* col < tok->indstack[tok->indent] */ {
452 /* Dedent -- any number, must be consistent */
453 while (tok->indent > 0 &&
454 col < tok->indstack[tok->indent]) {
455 tok->indent--;
456 tok->pendin--;
457 }
458 if (col != tok->indstack[tok->indent]) {
459 fprintf(stderr, "inconsistent dedent\n");
460 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000461 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000462 return ERRORTOKEN;
463 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000464 }
465 }
466 }
467
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000468 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000469
470 /* Return pending indents/dedents */
471 if (tok->pendin != 0) {
472 if (tok->pendin < 0) {
473 tok->pendin++;
474 return DEDENT;
475 }
476 else {
477 tok->pendin--;
478 return INDENT;
479 }
480 }
481
482 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000483 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000484 /* Skip spaces */
485 do {
486 c = tok_nextc(tok);
487 } while (c == ' ' || c == '\t');
488
489 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000490 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000491
492 /* Skip comment */
493 if (c == '#') {
494 /* Hack to allow overriding the tabsize in the file.
495 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000496 beginning or end of the file. (Will vi never die...?)
497 For Python it must be at the beginning of the file! */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000498 /* XXX The real vi syntax is actually different :-( */
499 /* XXX Should recognize Emacs syntax, too */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000500 int x;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000501 if (sscanf(tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000502 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000503 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000504 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000505 tok->tabsize = x;
506 }
507 do {
508 c = tok_nextc(tok);
509 } while (c != EOF && c != '\n');
510 }
511
512 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000513 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000514 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000515 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000516
517 /* Identifier (most frequent token!) */
518 if (isalpha(c) || c == '_') {
519 do {
520 c = tok_nextc(tok);
521 } while (isalnum(c) || c == '_');
522 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000523 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000524 *p_end = tok->cur;
525 return NAME;
526 }
527
528 /* Newline */
529 if (c == '\n') {
530 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000531 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000532 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000533 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000534 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
535 return NEWLINE;
536 }
537
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000538 /* Period or number starting with period? */
539 if (c == '.') {
540 c = tok_nextc(tok);
541 if (isdigit(c)) {
542 goto fraction;
543 }
544 else {
545 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000546 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000547 *p_end = tok->cur;
548 return DOT;
549 }
550 }
551
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000552 /* Number */
553 if (isdigit(c)) {
554 if (c == '0') {
555 /* Hex or octal */
556 c = tok_nextc(tok);
557 if (c == '.')
558 goto fraction;
559 if (c == 'x' || c == 'X') {
560 /* Hex */
561 do {
562 c = tok_nextc(tok);
563 } while (isxdigit(c));
564 }
565 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000566 /* XXX This is broken! E.g.,
567 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000568 /* Octal; c is first char of it */
569 /* There's no 'isoctdigit' macro, sigh */
570 while ('0' <= c && c < '8') {
571 c = tok_nextc(tok);
572 }
573 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000574 if (c == 'l' || c == 'L')
575 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000576 }
577 else {
578 /* Decimal */
579 do {
580 c = tok_nextc(tok);
581 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000582 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000583 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000584 else {
585 /* Accept floating point numbers.
586 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000587 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000588 if (c == '.') {
589 fraction:
590 /* Fraction */
591 do {
592 c = tok_nextc(tok);
593 } while (isdigit(c));
594 }
595 if (c == 'e' || c == 'E') {
596 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000597 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000598 if (c == '+' || c == '-')
599 c = tok_nextc(tok);
600 while (isdigit(c)) {
601 c = tok_nextc(tok);
602 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603 }
604 }
605 }
606 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000607 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608 *p_end = tok->cur;
609 return NUMBER;
610 }
611
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000612 /* String */
613 if (c == '\'' || c == '"') {
614 int quote = c;
615 int triple = 0;
616 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617 for (;;) {
618 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000619 if (c == '\n') {
620 if (!triple) {
621 tok->done = E_TOKEN;
622 tok_backup(tok, c);
623 return ERRORTOKEN;
624 }
625 tripcount = 0;
626 }
627 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000628 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000629 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630 return ERRORTOKEN;
631 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000632 else if (c == quote) {
633 tripcount++;
634 if (tok->cur == tok->start+2) {
635 c = tok_nextc(tok);
636 if (c == quote) {
637 triple = 1;
638 tripcount = 0;
639 continue;
640 }
641 tok_backup(tok, c);
642 }
643 if (!triple || tripcount == 3)
644 break;
645 }
646 else if (c == '\\') {
647 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000649 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000651 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652 return ERRORTOKEN;
653 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000654 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000655 else
656 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000658 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000659 *p_end = tok->cur;
660 return STRING;
661 }
662
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000663 /* Line continuation */
664 if (c == '\\') {
665 c = tok_nextc(tok);
666 if (c != '\n') {
667 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000668 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000669 return ERRORTOKEN;
670 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000671 goto again; /* Read next line */
672 }
673
Guido van Rossumfbab9051991-10-20 20:25:03 +0000674 /* Check for two-character token */
675 {
676 int c2 = tok_nextc(tok);
677 int token = tok_2char(c, c2);
678 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000679 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000680 *p_end = tok->cur;
681 return token;
682 }
683 tok_backup(tok, c2);
684 }
685
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000686 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000687 switch (c) {
688 case '(':
689 case '[':
690 case '{':
691 tok->level++;
692 break;
693 case ')':
694 case ']':
695 case '}':
696 tok->level--;
697 break;
698 }
699
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000700 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000701 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702 *p_end = tok->cur;
703 return tok_1char(c);
704}
705
706
707#ifdef DEBUG
708
709void
710tok_dump(type, start, end)
711 int type;
712 char *start, *end;
713{
714 printf("%s", tok_name[type]);
715 if (type == NAME || type == NUMBER || type == STRING || type == OP)
716 printf("(%.*s)", (int)(end - start), start);
717}
718
719#endif