/***********************************************************
Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.

STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
24
/* Tokenizer implementation */

/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum3f5da241990-12-20 15:06:42 +000031#include "pgenheaders.h"
32
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033#include <ctype.h>
34#include "string.h"
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#include "tokenizer.h"
38#include "errcode.h"
39
/* Default distance between tab stops, used as the initial tab size of
   a tokenizer: 4 on the Macintosh, otherwise 8 unless TABSIZE has
   already been defined. */
#if defined(macintosh)
#define TABSIZE 4
#endif

#if !defined(TABSIZE)
#define TABSIZE 8
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
49static struct tok_state *tok_new PROTO((void));
50static int tok_nextc PROTO((struct tok_state *tok));
51static void tok_backup PROTO((struct tok_state *tok, int c));
52
/* Token names, indexed by token type.
   This table must match the #defines in token.h! */

char *tok_name[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Single-character tokens */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Two-character comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Bitwise operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",

	/* Catch-all and pseudo tokens */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
97
98
99/* Create and initialize a new tok_state structure */
100
101static struct tok_state *
102tok_new()
103{
104 struct tok_state *tok = NEW(struct tok_state, 1);
105 if (tok == NULL)
106 return NULL;
107 tok->buf = tok->cur = tok->end = tok->inp = NULL;
108 tok->done = E_OK;
109 tok->fp = NULL;
110 tok->tabsize = TABSIZE;
111 tok->indent = 0;
112 tok->indstack[0] = 0;
113 tok->atbol = 1;
114 tok->pendin = 0;
115 tok->prompt = tok->nextprompt = NULL;
116 tok->lineno = 0;
117 return tok;
118}
119
120
121/* Set up tokenizer for string */
122
123struct tok_state *
124tok_setups(str)
125 char *str;
126{
127 struct tok_state *tok = tok_new();
128 if (tok == NULL)
129 return NULL;
130 tok->buf = tok->cur = str;
131 tok->end = tok->inp = strchr(str, '\0');
132 return tok;
133}
134
135
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000136/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137
138struct tok_state *
139tok_setupf(fp, ps1, ps2)
140 FILE *fp;
141 char *ps1, *ps2;
142{
143 struct tok_state *tok = tok_new();
144 if (tok == NULL)
145 return NULL;
146 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
147 DEL(tok);
148 return NULL;
149 }
150 tok->cur = tok->inp = tok->buf;
151 tok->end = tok->buf + BUFSIZ;
152 tok->fp = fp;
153 tok->prompt = ps1;
154 tok->nextprompt = ps2;
155 return tok;
156}
157
158
159/* Free a tok_state structure */
160
161void
162tok_free(tok)
163 struct tok_state *tok;
164{
165 /* XXX really need a separate flag to say 'my buffer' */
166 if (tok->fp != NULL && tok->buf != NULL)
167 DEL(tok->buf);
168 DEL(tok);
169}
170
171
172/* Get next char, updating state; error code goes into tok->done */
173
174static int
175tok_nextc(tok)
176 register struct tok_state *tok;
177{
178 if (tok->done != E_OK)
179 return EOF;
180
181 for (;;) {
182 if (tok->cur < tok->inp)
183 return *tok->cur++;
184 if (tok->fp == NULL) {
185 tok->done = E_EOF;
186 return EOF;
187 }
188 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
189 tok->inp = tok->buf;
190 if (tok->inp == tok->end) {
191 int n = tok->end - tok->buf;
192 char *new = tok->buf;
193 RESIZE(new, char, n+n);
194 if (new == NULL) {
195 fprintf(stderr, "tokenizer out of mem\n");
196 tok->done = E_NOMEM;
197 return EOF;
198 }
199 tok->buf = new;
200 tok->inp = tok->buf + n;
201 tok->end = tok->inp + n;
202 }
203#ifdef USE_READLINE
204 if (tok->prompt != NULL) {
205 extern char *readline PROTO((char *prompt));
206 static int been_here;
207 if (!been_here) {
208 /* Force rebind of TAB to insert-tab */
209 extern int rl_insert();
210 rl_bind_key('\t', rl_insert);
211 been_here++;
212 }
213 if (tok->buf != NULL)
214 free(tok->buf);
215 tok->buf = readline(tok->prompt);
216 (void) intrcheck(); /* Clear pending interrupt */
217 if (tok->nextprompt != NULL)
218 tok->prompt = tok->nextprompt;
219 /* XXX different semantics w/o readline()! */
220 if (tok->buf == NULL) {
221 tok->done = E_EOF;
222 }
223 else {
224 unsigned int n = strlen(tok->buf);
225 if (n > 0)
226 add_history(tok->buf);
227 /* Append the '\n' that readline()
228 doesn't give us, for the tokenizer... */
229 tok->buf = realloc(tok->buf, n+2);
230 if (tok->buf == NULL)
231 tok->done = E_NOMEM;
232 else {
233 tok->end = tok->buf + n;
234 *tok->end++ = '\n';
235 *tok->end = '\0';
236 tok->inp = tok->end;
237 tok->cur = tok->buf;
238 }
239 }
240 }
241 else
242#endif
243 {
244 tok->cur = tok->inp;
245 if (tok->prompt != NULL && tok->inp == tok->buf) {
246 fprintf(stderr, "%s", tok->prompt);
247 tok->prompt = tok->nextprompt;
248 }
249 tok->done = fgets_intr(tok->inp,
250 (int)(tok->end - tok->inp), tok->fp);
251 }
252 if (tok->done != E_OK) {
253 if (tok->prompt != NULL)
254 fprintf(stderr, "\n");
255 return EOF;
256 }
257 tok->inp = strchr(tok->inp, '\0');
258 }
259}
260
261
262/* Back-up one character */
263
264static void
265tok_backup(tok, c)
266 register struct tok_state *tok;
267 register int c;
268{
269 if (c != EOF) {
270 if (--tok->cur < tok->buf) {
271 fprintf(stderr, "tok_backup: begin of buffer\n");
272 abort();
273 }
274 if (*tok->cur != c)
275 *tok->cur = c;
276 }
277}
278
279
280/* Return the token corresponding to a single character */
281
282int
283tok_1char(c)
284 int c;
285{
286 switch (c) {
287 case '(': return LPAR;
288 case ')': return RPAR;
289 case '[': return LSQB;
290 case ']': return RSQB;
291 case ':': return COLON;
292 case ',': return COMMA;
293 case ';': return SEMI;
294 case '+': return PLUS;
295 case '-': return MINUS;
296 case '*': return STAR;
297 case '/': return SLASH;
298 case '|': return VBAR;
299 case '&': return AMPER;
300 case '<': return LESS;
301 case '>': return GREATER;
302 case '=': return EQUAL;
303 case '.': return DOT;
304 case '%': return PERCENT;
305 case '`': return BACKQUOTE;
306 case '{': return LBRACE;
307 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000308 case '^': return CIRCUMFLEX;
309 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000310 default: return OP;
311 }
312}
313
314
Guido van Rossumfbab9051991-10-20 20:25:03 +0000315int
316tok_2char(c1, c2)
317 int c1, c2;
318{
319 switch (c1) {
320 case '=':
321 switch (c2) {
322 case '=': return EQEQUAL;
323 }
324 break;
325 case '!':
326 switch (c2) {
327 case '=': return NOTEQUAL;
328 }
329 break;
330 case '<':
331 switch (c2) {
332 case '>': return NOTEQUAL;
333 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000334 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000335 }
336 break;
337 case '>':
338 switch (c2) {
339 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000340 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000341 }
342 break;
343 }
344 return OP;
345}
346
347
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000348/* Get next token, after space stripping etc. */
349
350int
351tok_get(tok, p_start, p_end)
352 register struct tok_state *tok; /* In/out: tokenizer state */
353 char **p_start, **p_end; /* Out: point to start/end of token */
354{
355 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000356 int blankline;
357
358 nextline:
359 blankline = 0;
360
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000361 /* Get indentation level */
362 if (tok->atbol) {
363 register int col = 0;
364 tok->atbol = 0;
365 tok->lineno++;
366 for (;;) {
367 c = tok_nextc(tok);
368 if (c == ' ')
369 col++;
370 else if (c == '\t')
371 col = (col/tok->tabsize + 1) * tok->tabsize;
372 else
373 break;
374 }
375 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000376 if (c == '#' || c == '\n') {
377 /* Lines with only whitespace and/or comments
378 shouldn't affect the indentation and are
379 not passed to the parser as NEWLINE tokens,
380 except *totally* empty lines in interactive
381 mode, which signal the end of a command group. */
382 if (col == 0 && c == '\n' && tok->prompt != NULL)
383 blankline = 0; /* Let it through */
384 else
385 blankline = 1; /* Ignore completely */
386 /* We can't jump back right here since we still
387 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000388 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000389 if (!blankline) {
390 if (col == tok->indstack[tok->indent]) {
391 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000392 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000393 else if (col > tok->indstack[tok->indent]) {
394 /* Indent -- always one */
395 if (tok->indent+1 >= MAXINDENT) {
396 fprintf(stderr, "excessive indent\n");
397 tok->done = E_TOKEN;
398 return ERRORTOKEN;
399 }
400 tok->pendin++;
401 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000402 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000403 else /* col < tok->indstack[tok->indent] */ {
404 /* Dedent -- any number, must be consistent */
405 while (tok->indent > 0 &&
406 col < tok->indstack[tok->indent]) {
407 tok->indent--;
408 tok->pendin--;
409 }
410 if (col != tok->indstack[tok->indent]) {
411 fprintf(stderr, "inconsistent dedent\n");
412 tok->done = E_TOKEN;
413 return ERRORTOKEN;
414 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000415 }
416 }
417 }
418
419 *p_start = *p_end = tok->cur;
420
421 /* Return pending indents/dedents */
422 if (tok->pendin != 0) {
423 if (tok->pendin < 0) {
424 tok->pendin++;
425 return DEDENT;
426 }
427 else {
428 tok->pendin--;
429 return INDENT;
430 }
431 }
432
433 again:
434 /* Skip spaces */
435 do {
436 c = tok_nextc(tok);
437 } while (c == ' ' || c == '\t');
438
439 /* Set start of current token */
440 *p_start = tok->cur - 1;
441
442 /* Skip comment */
443 if (c == '#') {
444 /* Hack to allow overriding the tabsize in the file.
445 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000446 beginning or end of the file. (Will vi never die...?)
447 For Python it must be at the beginning of the file! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000448 int x;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000449 /* XXX The cast to (unsigned char *) is needed by THINK C 3.0 */
Guido van Rossumb156d721990-12-20 23:13:00 +0000450 if (sscanf(/*(unsigned char *)*/tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000451 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000452 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000453 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000454 tok->tabsize = x;
455 }
456 do {
457 c = tok_nextc(tok);
458 } while (c != EOF && c != '\n');
459 }
460
461 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000462 if (c == EOF) {
463 *p_start = *p_end = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000464 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000465 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000466
467 /* Identifier (most frequent token!) */
468 if (isalpha(c) || c == '_') {
469 do {
470 c = tok_nextc(tok);
471 } while (isalnum(c) || c == '_');
472 tok_backup(tok, c);
473 *p_end = tok->cur;
474 return NAME;
475 }
476
477 /* Newline */
478 if (c == '\n') {
479 tok->atbol = 1;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000480 if (blankline)
481 goto nextline;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000482 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
483 return NEWLINE;
484 }
485
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000486 /* Period or number starting with period? */
487 if (c == '.') {
488 c = tok_nextc(tok);
489 if (isdigit(c)) {
490 goto fraction;
491 }
492 else {
493 tok_backup(tok, c);
494 *p_end = tok->cur;
495 return DOT;
496 }
497 }
498
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000499 /* Number */
500 if (isdigit(c)) {
501 if (c == '0') {
502 /* Hex or octal */
503 c = tok_nextc(tok);
504 if (c == '.')
505 goto fraction;
506 if (c == 'x' || c == 'X') {
507 /* Hex */
508 do {
509 c = tok_nextc(tok);
510 } while (isxdigit(c));
511 }
512 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000513 /* XXX This is broken! E.g.,
514 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000515 /* Octal; c is first char of it */
516 /* There's no 'isoctdigit' macro, sigh */
517 while ('0' <= c && c < '8') {
518 c = tok_nextc(tok);
519 }
520 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000521 if (c == 'l' || c == 'L')
522 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000523 }
524 else {
525 /* Decimal */
526 do {
527 c = tok_nextc(tok);
528 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000529 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000530 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000531 else {
532 /* Accept floating point numbers.
533 XXX This accepts incomplete things like
534 XXX 12e or 1e+; worry run-time.
535 XXX Doesn't accept numbers
536 XXX starting with a dot */
537 if (c == '.') {
538 fraction:
539 /* Fraction */
540 do {
541 c = tok_nextc(tok);
542 } while (isdigit(c));
543 }
544 if (c == 'e' || c == 'E') {
545 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000546 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000547 if (c == '+' || c == '-')
548 c = tok_nextc(tok);
549 while (isdigit(c)) {
550 c = tok_nextc(tok);
551 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000552 }
553 }
554 }
555 tok_backup(tok, c);
556 *p_end = tok->cur;
557 return NUMBER;
558 }
559
560 /* String */
561 if (c == '\'') {
562 for (;;) {
563 c = tok_nextc(tok);
564 if (c == '\n' || c == EOF) {
565 tok->done = E_TOKEN;
566 return ERRORTOKEN;
567 }
568 if (c == '\\') {
569 c = tok_nextc(tok);
570 *p_end = tok->cur;
571 if (c == '\n' || c == EOF) {
572 tok->done = E_TOKEN;
573 return ERRORTOKEN;
574 }
575 continue;
576 }
577 if (c == '\'')
578 break;
579 }
580 *p_end = tok->cur;
581 return STRING;
582 }
583
584 /* Line continuation */
585 if (c == '\\') {
586 c = tok_nextc(tok);
587 if (c != '\n') {
588 tok->done = E_TOKEN;
589 return ERRORTOKEN;
590 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000591 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000592 goto again; /* Read next line */
593 }
594
Guido van Rossumfbab9051991-10-20 20:25:03 +0000595 /* Check for two-character token */
596 {
597 int c2 = tok_nextc(tok);
598 int token = tok_2char(c, c2);
599 if (token != OP) {
600 *p_end = tok->cur;
601 return token;
602 }
603 tok_backup(tok, c2);
604 }
605
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606 /* Punctuation character */
607 *p_end = tok->cur;
608 return tok_1char(c);
609}
610
611
#ifdef DEBUG

/* Print the name of token type 'type' on stdout.  For NAME, NUMBER,
   STRING and OP tokens the text between start and end follows in
   parentheses. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", tok_name[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		break;
	}
}

#endif