blob: 4d759d1cd97805eac4765dc88f3c70f91752b55b [file] [log] [blame]
/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.

STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
24
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000025/* Tokenizer implementation */
26
Guido van Rossum3f5da241990-12-20 15:06:42 +000027#include "pgenheaders.h"
28
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029#include <ctype.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000031#include "tokenizer.h"
32#include "errcode.h"
33
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034extern char *my_readline PROTO((char *));
35/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
43static struct tok_state *tok_new PROTO((void));
44static int tok_nextc PROTO((struct tok_state *tok));
45static void tok_backup PROTO((struct tok_state *tok, int c));
46
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *tok_name[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",
	/* Single-character operators and delimiters */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",
	/* Two-character comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* Bitwise operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	/* Catch-all operator, error marker, and table-size sentinel */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
91
92
93/* Create and initialize a new tok_state structure */
94
95static struct tok_state *
96tok_new()
97{
98 struct tok_state *tok = NEW(struct tok_state, 1);
99 if (tok == NULL)
100 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000101 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000102 tok->done = E_OK;
103 tok->fp = NULL;
104 tok->tabsize = TABSIZE;
105 tok->indent = 0;
106 tok->indstack[0] = 0;
107 tok->atbol = 1;
108 tok->pendin = 0;
109 tok->prompt = tok->nextprompt = NULL;
110 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000111 tok->level = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
119tok_setups(str)
120 char *str;
121{
122 struct tok_state *tok = tok_new();
123 if (tok == NULL)
124 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000125 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000126 return tok;
127}
128
129
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000130/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131
132struct tok_state *
133tok_setupf(fp, ps1, ps2)
134 FILE *fp;
135 char *ps1, *ps2;
136{
137 struct tok_state *tok = tok_new();
138 if (tok == NULL)
139 return NULL;
140 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
141 DEL(tok);
142 return NULL;
143 }
144 tok->cur = tok->inp = tok->buf;
145 tok->end = tok->buf + BUFSIZ;
146 tok->fp = fp;
147 tok->prompt = ps1;
148 tok->nextprompt = ps2;
149 return tok;
150}
151
152
153/* Free a tok_state structure */
154
155void
156tok_free(tok)
157 struct tok_state *tok;
158{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159 if (tok->fp != NULL && tok->buf != NULL)
160 DEL(tok->buf);
161 DEL(tok);
162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
168tok_nextc(tok)
169 register struct tok_state *tok;
170{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000171 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000172 if (tok->cur != tok->inp) {
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000173 return *tok->cur++; /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000174 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000175 if (tok->done != E_OK)
176 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000177 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000178 char *end = strchr(tok->inp, '\n');
179 if (end != NULL)
180 end++;
181 else {
182 end = strchr(tok->inp, '\0');
183 if (end == tok->inp) {
184 tok->done = E_EOF;
185 return EOF;
186 }
187 }
188 if (tok->start == NULL)
189 tok->buf = tok->cur;
190 tok->lineno++;
191 tok->inp = end;
192 return *tok->cur++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000193 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000194 if (tok->prompt != NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000195 char *new = my_readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000196 if (tok->nextprompt != NULL)
197 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000198 if (new == NULL)
199 tok->done = E_INTR;
200 else if (*new == '\0') {
201 free(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000202 tok->done = E_EOF;
203 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000204 else if (tok->start != NULL) {
205 int start = tok->start - tok->buf;
206 int oldlen = tok->cur - tok->buf;
207 int newlen = oldlen + strlen(new);
208 char *buf = realloc(tok->buf, newlen+1);
209 tok->lineno++;
210 if (buf == NULL) {
211 free(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000212 tok->buf = NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000213 free(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000214 tok->done = E_NOMEM;
215 return EOF;
216 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000217 tok->buf = buf;
218 tok->cur = tok->buf + oldlen;
219 strcpy(tok->buf + oldlen, new);
220 free(new);
221 tok->inp = tok->buf + newlen;
222 tok->end = tok->inp + 1;
223 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000224 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000225 else {
226 tok->lineno++;
227 if (tok->buf != NULL)
228 free(tok->buf);
229 tok->buf = new;
230 tok->cur = tok->buf;
231 tok->inp = strchr(tok->buf, '\0');
232 tok->end = tok->inp + 1;
233 }
234 }
235 else {
236 int done = 0;
237 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000238 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000239 if (tok->start == NULL) {
240 if (tok->buf == NULL) {
241 tok->buf = NEW(char, BUFSIZ);
242 if (tok->buf == NULL) {
243 tok->done = E_NOMEM;
244 return EOF;
245 }
246 tok->end = tok->buf + BUFSIZ;
247 }
248 if (fgets(tok->buf, (int)(tok->end - tok->buf),
249 tok->fp) == NULL) {
250 tok->done = E_EOF;
251 done = 1;
252 }
253 else {
254 tok->done = E_OK;
255 tok->inp = strchr(tok->buf, '\0');
256 done = tok->inp[-1] == '\n';
257 }
258 }
259 else {
260 cur = tok->cur - tok->buf;
Guido van Rossum78c05351995-01-17 16:12:13 +0000261 if (feof(tok->fp)) {
262 tok->done = E_EOF;
263 done = 1;
264 }
265 else
266 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000267 }
268 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000269 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000270 while (!done) {
271 int curstart = tok->start == NULL ? -1 :
272 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000273 int curvalid = tok->inp - tok->buf;
274 int cursize = tok->end - tok->buf;
275 int newsize = cursize + BUFSIZ;
276 char *newbuf = tok->buf;
277 RESIZE(newbuf, char, newsize);
278 if (newbuf == NULL) {
279 tok->done = E_NOMEM;
280 tok->cur = tok->inp;
281 return EOF;
282 }
283 tok->buf = newbuf;
284 tok->inp = tok->buf + curvalid;
285 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000286 tok->start = curstart < 0 ? NULL :
287 tok->buf + curstart;
288 if (fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000289 (int)(tok->end - tok->inp),
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000290 tok->fp) == NULL) {
291 /* Last line does not end in \n,
292 fake one */
293 strcpy(tok->inp, "\n");
294 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000295 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000296 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000297 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000298 tok->cur = tok->buf + cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000299 /* replace "\r\n" with "\n" */
300 pt = tok->inp - 2;
301 if (pt >= tok->buf && *pt == '\r') {
302 *pt++ = '\n';
303 *pt = '\0';
304 tok->inp = pt;
305 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000306 }
307 if (tok->done != E_OK) {
308 if (tok->prompt != NULL)
309 fprintf(stderr, "\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000310 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000311 return EOF;
312 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000313 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000314 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000315}
316
317
318/* Back-up one character */
319
320static void
321tok_backup(tok, c)
322 register struct tok_state *tok;
323 register int c;
324{
325 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000326 if (--tok->cur < tok->buf)
327 fatal("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000328 if (*tok->cur != c)
329 *tok->cur = c;
330 }
331}
332
333
334/* Return the token corresponding to a single character */
335
336int
337tok_1char(c)
338 int c;
339{
340 switch (c) {
341 case '(': return LPAR;
342 case ')': return RPAR;
343 case '[': return LSQB;
344 case ']': return RSQB;
345 case ':': return COLON;
346 case ',': return COMMA;
347 case ';': return SEMI;
348 case '+': return PLUS;
349 case '-': return MINUS;
350 case '*': return STAR;
351 case '/': return SLASH;
352 case '|': return VBAR;
353 case '&': return AMPER;
354 case '<': return LESS;
355 case '>': return GREATER;
356 case '=': return EQUAL;
357 case '.': return DOT;
358 case '%': return PERCENT;
359 case '`': return BACKQUOTE;
360 case '{': return LBRACE;
361 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000362 case '^': return CIRCUMFLEX;
363 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000364 default: return OP;
365 }
366}
367
368
Guido van Rossumfbab9051991-10-20 20:25:03 +0000369int
370tok_2char(c1, c2)
371 int c1, c2;
372{
373 switch (c1) {
374 case '=':
375 switch (c2) {
376 case '=': return EQEQUAL;
377 }
378 break;
379 case '!':
380 switch (c2) {
381 case '=': return NOTEQUAL;
382 }
383 break;
384 case '<':
385 switch (c2) {
386 case '>': return NOTEQUAL;
387 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000388 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000389 }
390 break;
391 case '>':
392 switch (c2) {
393 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000394 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000395 }
396 break;
397 }
398 return OP;
399}
400
401
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000402/* Get next token, after space stripping etc. */
403
404int
405tok_get(tok, p_start, p_end)
406 register struct tok_state *tok; /* In/out: tokenizer state */
407 char **p_start, **p_end; /* Out: point to start/end of token */
408{
409 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000410 int blankline;
411
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000412 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000413 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000414 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000415 blankline = 0;
416
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000417 /* Get indentation level */
418 if (tok->atbol) {
419 register int col = 0;
420 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000421 for (;;) {
422 c = tok_nextc(tok);
423 if (c == ' ')
424 col++;
425 else if (c == '\t')
426 col = (col/tok->tabsize + 1) * tok->tabsize;
427 else
428 break;
429 }
430 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000431 if (c == '#' || c == '\n') {
432 /* Lines with only whitespace and/or comments
433 shouldn't affect the indentation and are
434 not passed to the parser as NEWLINE tokens,
435 except *totally* empty lines in interactive
436 mode, which signal the end of a command group. */
437 if (col == 0 && c == '\n' && tok->prompt != NULL)
438 blankline = 0; /* Let it through */
439 else
440 blankline = 1; /* Ignore completely */
441 /* We can't jump back right here since we still
442 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000443 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000444 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000445 if (col == tok->indstack[tok->indent]) {
446 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000447 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000448 else if (col > tok->indstack[tok->indent]) {
449 /* Indent -- always one */
450 if (tok->indent+1 >= MAXINDENT) {
451 fprintf(stderr, "excessive indent\n");
452 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000453 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000454 return ERRORTOKEN;
455 }
456 tok->pendin++;
457 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000458 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000459 else /* col < tok->indstack[tok->indent] */ {
460 /* Dedent -- any number, must be consistent */
461 while (tok->indent > 0 &&
462 col < tok->indstack[tok->indent]) {
463 tok->indent--;
464 tok->pendin--;
465 }
466 if (col != tok->indstack[tok->indent]) {
467 fprintf(stderr, "inconsistent dedent\n");
468 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000469 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000470 return ERRORTOKEN;
471 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000472 }
473 }
474 }
475
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000476 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000477
478 /* Return pending indents/dedents */
479 if (tok->pendin != 0) {
480 if (tok->pendin < 0) {
481 tok->pendin++;
482 return DEDENT;
483 }
484 else {
485 tok->pendin--;
486 return INDENT;
487 }
488 }
489
490 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000491 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000492 /* Skip spaces */
493 do {
494 c = tok_nextc(tok);
495 } while (c == ' ' || c == '\t');
496
497 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000498 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000499
500 /* Skip comment */
501 if (c == '#') {
502 /* Hack to allow overriding the tabsize in the file.
503 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000504 beginning or end of the file. (Will vi never die...?)
505 For Python it must be at the beginning of the file! */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000506 /* XXX The real vi syntax is actually different :-( */
507 /* XXX Should recognize Emacs syntax, too */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000508 int x;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000509 if (sscanf(tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000510 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000511 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000512 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000513 tok->tabsize = x;
514 }
515 do {
516 c = tok_nextc(tok);
517 } while (c != EOF && c != '\n');
518 }
519
520 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000521 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000522 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000523 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000524
525 /* Identifier (most frequent token!) */
526 if (isalpha(c) || c == '_') {
527 do {
528 c = tok_nextc(tok);
529 } while (isalnum(c) || c == '_');
530 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000531 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000532 *p_end = tok->cur;
533 return NAME;
534 }
535
536 /* Newline */
537 if (c == '\n') {
538 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +0000539 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000540 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000541 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000542 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
543 return NEWLINE;
544 }
545
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000546 /* Period or number starting with period? */
547 if (c == '.') {
548 c = tok_nextc(tok);
549 if (isdigit(c)) {
550 goto fraction;
551 }
552 else {
553 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000554 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000555 *p_end = tok->cur;
556 return DOT;
557 }
558 }
559
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000560 /* Number */
561 if (isdigit(c)) {
562 if (c == '0') {
563 /* Hex or octal */
564 c = tok_nextc(tok);
565 if (c == '.')
566 goto fraction;
567 if (c == 'x' || c == 'X') {
568 /* Hex */
569 do {
570 c = tok_nextc(tok);
571 } while (isxdigit(c));
572 }
573 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000574 /* XXX This is broken! E.g.,
575 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000576 /* Octal; c is first char of it */
577 /* There's no 'isoctdigit' macro, sigh */
578 while ('0' <= c && c < '8') {
579 c = tok_nextc(tok);
580 }
581 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000582 if (c == 'l' || c == 'L')
583 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000584 }
585 else {
586 /* Decimal */
587 do {
588 c = tok_nextc(tok);
589 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000590 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000591 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000592 else {
593 /* Accept floating point numbers.
594 XXX This accepts incomplete things like
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000595 XXX 12e or 1e+; worry run-time */
Guido van Rossumf023c461991-05-05 20:16:20 +0000596 if (c == '.') {
597 fraction:
598 /* Fraction */
599 do {
600 c = tok_nextc(tok);
601 } while (isdigit(c));
602 }
603 if (c == 'e' || c == 'E') {
604 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000605 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000606 if (c == '+' || c == '-')
607 c = tok_nextc(tok);
608 while (isdigit(c)) {
609 c = tok_nextc(tok);
610 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000611 }
612 }
613 }
614 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000615 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000616 *p_end = tok->cur;
617 return NUMBER;
618 }
619
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000620 /* String */
621 if (c == '\'' || c == '"') {
622 int quote = c;
623 int triple = 0;
624 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625 for (;;) {
626 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000627 if (c == '\n') {
628 if (!triple) {
629 tok->done = E_TOKEN;
630 tok_backup(tok, c);
631 return ERRORTOKEN;
632 }
633 tripcount = 0;
634 }
635 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000637 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000638 return ERRORTOKEN;
639 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000640 else if (c == quote) {
641 tripcount++;
642 if (tok->cur == tok->start+2) {
643 c = tok_nextc(tok);
644 if (c == quote) {
645 triple = 1;
646 tripcount = 0;
647 continue;
648 }
649 tok_backup(tok, c);
650 }
651 if (!triple || tripcount == 3)
652 break;
653 }
654 else if (c == '\\') {
655 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000657 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000659 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660 return ERRORTOKEN;
661 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000662 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000663 else
664 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000665 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000666 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +0000667 *p_end = tok->cur;
668 return STRING;
669 }
670
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000671 /* Line continuation */
672 if (c == '\\') {
673 c = tok_nextc(tok);
674 if (c != '\n') {
675 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000676 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677 return ERRORTOKEN;
678 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000679 goto again; /* Read next line */
680 }
681
Guido van Rossumfbab9051991-10-20 20:25:03 +0000682 /* Check for two-character token */
683 {
684 int c2 = tok_nextc(tok);
685 int token = tok_2char(c, c2);
686 if (token != OP) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000687 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000688 *p_end = tok->cur;
689 return token;
690 }
691 tok_backup(tok, c2);
692 }
693
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000694 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +0000695 switch (c) {
696 case '(':
697 case '[':
698 case '{':
699 tok->level++;
700 break;
701 case ')':
702 case ']':
703 case '}':
704 tok->level--;
705 break;
706 }
707
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000708 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000709 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000710 *p_end = tok->cur;
711 return tok_1char(c);
712}
713
714
#ifdef DEBUG

/* Debugging aid: print the name of token `type` on stdout.  For NAME,
   NUMBER, STRING and OP tokens the token text, i.e. the characters from
   start up to but not including end, follows in parentheses.
   No newline is printed. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", tok_name[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		break;
	}
}

#endif
727#endif