blob: 1070d1359d5c8b781fec4b638719cfc20c793521 [file] [log] [blame]
/***********************************************************
Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.

STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
24
/* Tokenizer implementation */

/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum3f5da241990-12-20 15:06:42 +000031#include "pgenheaders.h"
32
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033#include <ctype.h>
34#include "string.h"
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#include "tokenizer.h"
38#include "errcode.h"
39
/* Default distance between tab stops, used when turning leading
   whitespace into an indentation column.  Never change this value:
   doing so would break the portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
44static struct tok_state *tok_new PROTO((void));
45static int tok_nextc PROTO((struct tok_state *tok));
46static void tok_backup PROTO((struct tok_state *tok, int c));
47
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */

char *tok_name[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Single-character operators and delimiters */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Two-character comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Bitwise operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",

	/* Generic operator, then the two bracketed pseudo entries */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
92
93
94/* Create and initialize a new tok_state structure */
95
96static struct tok_state *
97tok_new()
98{
99 struct tok_state *tok = NEW(struct tok_state, 1);
100 if (tok == NULL)
101 return NULL;
102 tok->buf = tok->cur = tok->end = tok->inp = NULL;
103 tok->done = E_OK;
104 tok->fp = NULL;
105 tok->tabsize = TABSIZE;
106 tok->indent = 0;
107 tok->indstack[0] = 0;
108 tok->atbol = 1;
109 tok->pendin = 0;
110 tok->prompt = tok->nextprompt = NULL;
111 tok->lineno = 0;
112 return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
119tok_setups(str)
120 char *str;
121{
122 struct tok_state *tok = tok_new();
123 if (tok == NULL)
124 return NULL;
125 tok->buf = tok->cur = str;
126 tok->end = tok->inp = strchr(str, '\0');
127 return tok;
128}
129
130
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000131/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000132
133struct tok_state *
134tok_setupf(fp, ps1, ps2)
135 FILE *fp;
136 char *ps1, *ps2;
137{
138 struct tok_state *tok = tok_new();
139 if (tok == NULL)
140 return NULL;
141 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
142 DEL(tok);
143 return NULL;
144 }
145 tok->cur = tok->inp = tok->buf;
146 tok->end = tok->buf + BUFSIZ;
147 tok->fp = fp;
148 tok->prompt = ps1;
149 tok->nextprompt = ps2;
150 return tok;
151}
152
153
154/* Free a tok_state structure */
155
156void
157tok_free(tok)
158 struct tok_state *tok;
159{
160 /* XXX really need a separate flag to say 'my buffer' */
161 if (tok->fp != NULL && tok->buf != NULL)
162 DEL(tok->buf);
163 DEL(tok);
164}
165
166
167/* Get next char, updating state; error code goes into tok->done */
168
169static int
170tok_nextc(tok)
171 register struct tok_state *tok;
172{
173 if (tok->done != E_OK)
174 return EOF;
175
176 for (;;) {
177 if (tok->cur < tok->inp)
178 return *tok->cur++;
179 if (tok->fp == NULL) {
180 tok->done = E_EOF;
181 return EOF;
182 }
183 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
184 tok->inp = tok->buf;
185 if (tok->inp == tok->end) {
186 int n = tok->end - tok->buf;
187 char *new = tok->buf;
188 RESIZE(new, char, n+n);
189 if (new == NULL) {
190 fprintf(stderr, "tokenizer out of mem\n");
191 tok->done = E_NOMEM;
192 return EOF;
193 }
194 tok->buf = new;
195 tok->inp = tok->buf + n;
196 tok->end = tok->inp + n;
197 }
198#ifdef USE_READLINE
199 if (tok->prompt != NULL) {
200 extern char *readline PROTO((char *prompt));
201 static int been_here;
202 if (!been_here) {
203 /* Force rebind of TAB to insert-tab */
204 extern int rl_insert();
205 rl_bind_key('\t', rl_insert);
206 been_here++;
207 }
208 if (tok->buf != NULL)
209 free(tok->buf);
210 tok->buf = readline(tok->prompt);
211 (void) intrcheck(); /* Clear pending interrupt */
212 if (tok->nextprompt != NULL)
213 tok->prompt = tok->nextprompt;
214 /* XXX different semantics w/o readline()! */
215 if (tok->buf == NULL) {
216 tok->done = E_EOF;
217 }
218 else {
219 unsigned int n = strlen(tok->buf);
220 if (n > 0)
221 add_history(tok->buf);
222 /* Append the '\n' that readline()
223 doesn't give us, for the tokenizer... */
224 tok->buf = realloc(tok->buf, n+2);
225 if (tok->buf == NULL)
226 tok->done = E_NOMEM;
227 else {
228 tok->end = tok->buf + n;
229 *tok->end++ = '\n';
230 *tok->end = '\0';
231 tok->inp = tok->end;
232 tok->cur = tok->buf;
233 }
234 }
235 }
236 else
237#endif
238 {
239 tok->cur = tok->inp;
240 if (tok->prompt != NULL && tok->inp == tok->buf) {
241 fprintf(stderr, "%s", tok->prompt);
242 tok->prompt = tok->nextprompt;
243 }
244 tok->done = fgets_intr(tok->inp,
245 (int)(tok->end - tok->inp), tok->fp);
246 }
247 if (tok->done != E_OK) {
248 if (tok->prompt != NULL)
249 fprintf(stderr, "\n");
250 return EOF;
251 }
252 tok->inp = strchr(tok->inp, '\0');
253 }
254}
255
256
257/* Back-up one character */
258
259static void
260tok_backup(tok, c)
261 register struct tok_state *tok;
262 register int c;
263{
264 if (c != EOF) {
265 if (--tok->cur < tok->buf) {
266 fprintf(stderr, "tok_backup: begin of buffer\n");
267 abort();
268 }
269 if (*tok->cur != c)
270 *tok->cur = c;
271 }
272}
273
274
275/* Return the token corresponding to a single character */
276
277int
278tok_1char(c)
279 int c;
280{
281 switch (c) {
282 case '(': return LPAR;
283 case ')': return RPAR;
284 case '[': return LSQB;
285 case ']': return RSQB;
286 case ':': return COLON;
287 case ',': return COMMA;
288 case ';': return SEMI;
289 case '+': return PLUS;
290 case '-': return MINUS;
291 case '*': return STAR;
292 case '/': return SLASH;
293 case '|': return VBAR;
294 case '&': return AMPER;
295 case '<': return LESS;
296 case '>': return GREATER;
297 case '=': return EQUAL;
298 case '.': return DOT;
299 case '%': return PERCENT;
300 case '`': return BACKQUOTE;
301 case '{': return LBRACE;
302 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000303 case '^': return CIRCUMFLEX;
304 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000305 default: return OP;
306 }
307}
308
309
Guido van Rossumfbab9051991-10-20 20:25:03 +0000310int
311tok_2char(c1, c2)
312 int c1, c2;
313{
314 switch (c1) {
315 case '=':
316 switch (c2) {
317 case '=': return EQEQUAL;
318 }
319 break;
320 case '!':
321 switch (c2) {
322 case '=': return NOTEQUAL;
323 }
324 break;
325 case '<':
326 switch (c2) {
327 case '>': return NOTEQUAL;
328 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000329 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000330 }
331 break;
332 case '>':
333 switch (c2) {
334 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000335 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000336 }
337 break;
338 }
339 return OP;
340}
341
342
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000343/* Get next token, after space stripping etc. */
344
345int
346tok_get(tok, p_start, p_end)
347 register struct tok_state *tok; /* In/out: tokenizer state */
348 char **p_start, **p_end; /* Out: point to start/end of token */
349{
350 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000351 int blankline;
352
353 nextline:
354 blankline = 0;
355
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000356 /* Get indentation level */
357 if (tok->atbol) {
358 register int col = 0;
359 tok->atbol = 0;
360 tok->lineno++;
361 for (;;) {
362 c = tok_nextc(tok);
363 if (c == ' ')
364 col++;
365 else if (c == '\t')
366 col = (col/tok->tabsize + 1) * tok->tabsize;
367 else
368 break;
369 }
370 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000371 if (c == '#' || c == '\n') {
372 /* Lines with only whitespace and/or comments
373 shouldn't affect the indentation and are
374 not passed to the parser as NEWLINE tokens,
375 except *totally* empty lines in interactive
376 mode, which signal the end of a command group. */
377 if (col == 0 && c == '\n' && tok->prompt != NULL)
378 blankline = 0; /* Let it through */
379 else
380 blankline = 1; /* Ignore completely */
381 /* We can't jump back right here since we still
382 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000383 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000384 if (!blankline) {
385 if (col == tok->indstack[tok->indent]) {
386 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000387 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000388 else if (col > tok->indstack[tok->indent]) {
389 /* Indent -- always one */
390 if (tok->indent+1 >= MAXINDENT) {
391 fprintf(stderr, "excessive indent\n");
392 tok->done = E_TOKEN;
393 return ERRORTOKEN;
394 }
395 tok->pendin++;
396 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000397 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000398 else /* col < tok->indstack[tok->indent] */ {
399 /* Dedent -- any number, must be consistent */
400 while (tok->indent > 0 &&
401 col < tok->indstack[tok->indent]) {
402 tok->indent--;
403 tok->pendin--;
404 }
405 if (col != tok->indstack[tok->indent]) {
406 fprintf(stderr, "inconsistent dedent\n");
407 tok->done = E_TOKEN;
408 return ERRORTOKEN;
409 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000410 }
411 }
412 }
413
414 *p_start = *p_end = tok->cur;
415
416 /* Return pending indents/dedents */
417 if (tok->pendin != 0) {
418 if (tok->pendin < 0) {
419 tok->pendin++;
420 return DEDENT;
421 }
422 else {
423 tok->pendin--;
424 return INDENT;
425 }
426 }
427
428 again:
429 /* Skip spaces */
430 do {
431 c = tok_nextc(tok);
432 } while (c == ' ' || c == '\t');
433
434 /* Set start of current token */
435 *p_start = tok->cur - 1;
436
437 /* Skip comment */
438 if (c == '#') {
439 /* Hack to allow overriding the tabsize in the file.
440 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000441 beginning or end of the file. (Will vi never die...?)
442 For Python it must be at the beginning of the file! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000443 int x;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000444 /* XXX The cast to (unsigned char *) is needed by THINK C 3.0 */
Guido van Rossumb156d721990-12-20 23:13:00 +0000445 if (sscanf(/*(unsigned char *)*/tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000446 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000447 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000448 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000449 tok->tabsize = x;
450 }
451 do {
452 c = tok_nextc(tok);
453 } while (c != EOF && c != '\n');
454 }
455
456 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000457 if (c == EOF) {
458 *p_start = *p_end = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000459 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000460 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000461
462 /* Identifier (most frequent token!) */
463 if (isalpha(c) || c == '_') {
464 do {
465 c = tok_nextc(tok);
466 } while (isalnum(c) || c == '_');
467 tok_backup(tok, c);
468 *p_end = tok->cur;
469 return NAME;
470 }
471
472 /* Newline */
473 if (c == '\n') {
474 tok->atbol = 1;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000475 if (blankline)
476 goto nextline;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000477 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
478 return NEWLINE;
479 }
480
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000481 /* Period or number starting with period? */
482 if (c == '.') {
483 c = tok_nextc(tok);
484 if (isdigit(c)) {
485 goto fraction;
486 }
487 else {
488 tok_backup(tok, c);
489 *p_end = tok->cur;
490 return DOT;
491 }
492 }
493
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000494 /* Number */
495 if (isdigit(c)) {
496 if (c == '0') {
497 /* Hex or octal */
498 c = tok_nextc(tok);
499 if (c == '.')
500 goto fraction;
501 if (c == 'x' || c == 'X') {
502 /* Hex */
503 do {
504 c = tok_nextc(tok);
505 } while (isxdigit(c));
506 }
507 else {
Guido van Rossum94309451991-12-10 14:01:05 +0000508 /* XXX This is broken! E.g.,
509 09.9 should be accepted as float! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000510 /* Octal; c is first char of it */
511 /* There's no 'isoctdigit' macro, sigh */
512 while ('0' <= c && c < '8') {
513 c = tok_nextc(tok);
514 }
515 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000516 if (c == 'l' || c == 'L')
517 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000518 }
519 else {
520 /* Decimal */
521 do {
522 c = tok_nextc(tok);
523 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000524 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000525 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000526 else {
527 /* Accept floating point numbers.
528 XXX This accepts incomplete things like
529 XXX 12e or 1e+; worry run-time.
530 XXX Doesn't accept numbers
531 XXX starting with a dot */
532 if (c == '.') {
533 fraction:
534 /* Fraction */
535 do {
536 c = tok_nextc(tok);
537 } while (isdigit(c));
538 }
539 if (c == 'e' || c == 'E') {
540 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000541 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000542 if (c == '+' || c == '-')
543 c = tok_nextc(tok);
544 while (isdigit(c)) {
545 c = tok_nextc(tok);
546 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000547 }
548 }
549 }
550 tok_backup(tok, c);
551 *p_end = tok->cur;
552 return NUMBER;
553 }
554
555 /* String */
556 if (c == '\'') {
557 for (;;) {
558 c = tok_nextc(tok);
559 if (c == '\n' || c == EOF) {
560 tok->done = E_TOKEN;
561 return ERRORTOKEN;
562 }
563 if (c == '\\') {
564 c = tok_nextc(tok);
565 *p_end = tok->cur;
566 if (c == '\n' || c == EOF) {
567 tok->done = E_TOKEN;
568 return ERRORTOKEN;
569 }
570 continue;
571 }
572 if (c == '\'')
573 break;
574 }
575 *p_end = tok->cur;
576 return STRING;
577 }
578
579 /* Line continuation */
580 if (c == '\\') {
581 c = tok_nextc(tok);
582 if (c != '\n') {
583 tok->done = E_TOKEN;
584 return ERRORTOKEN;
585 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000586 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000587 goto again; /* Read next line */
588 }
589
Guido van Rossumfbab9051991-10-20 20:25:03 +0000590 /* Check for two-character token */
591 {
592 int c2 = tok_nextc(tok);
593 int token = tok_2char(c, c2);
594 if (token != OP) {
595 *p_end = tok->cur;
596 return token;
597 }
598 tok_backup(tok, c2);
599 }
600
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000601 /* Punctuation character */
602 *p_end = tok->cur;
603 return tok_1char(c);
604}
605
606
607#ifdef DEBUG
608
609void
610tok_dump(type, start, end)
611 int type;
612 char *start, *end;
613{
614 printf("%s", tok_name[type]);
615 if (type == NAME || type == NUMBER || type == STRING || type == OP)
616 printf("(%.*s)", (int)(end - start), start);
617}
618
619#endif