blob: ccc6d6d94fa7069988aafabcf192483d1313a5ab [file] [log] [blame]
/***********************************************************
Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.

STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
24
/* Tokenizer implementation */

/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum3f5da241990-12-20 15:06:42 +000031#include "pgenheaders.h"
32
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033#include <ctype.h>
34#include "string.h"
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#include "tokenizer.h"
38#include "errcode.h"
39
/* Default number of columns per tab stop, used to initialize
   tok->tabsize: 4 when the `macintosh' macro is defined, otherwise 8
   unless TABSIZE was already defined before this point. */

#ifdef macintosh
#define TABSIZE 4
#endif

#ifndef TABSIZE
#define TABSIZE 8
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
49static struct tok_state *tok_new PROTO((void));
50static int tok_nextc PROTO((struct tok_state *tok));
51static void tok_backup PROTO((struct tok_state *tok, int c));
52
/* Token names: printable name of each token type, indexed by the
   token's number. */

char *tok_name[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
93
94
95/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
98tok_new()
99{
100 struct tok_state *tok = NEW(struct tok_state, 1);
101 if (tok == NULL)
102 return NULL;
103 tok->buf = tok->cur = tok->end = tok->inp = NULL;
104 tok->done = E_OK;
105 tok->fp = NULL;
106 tok->tabsize = TABSIZE;
107 tok->indent = 0;
108 tok->indstack[0] = 0;
109 tok->atbol = 1;
110 tok->pendin = 0;
111 tok->prompt = tok->nextprompt = NULL;
112 tok->lineno = 0;
113 return tok;
114}
115
116
117/* Set up tokenizer for string */
118
119struct tok_state *
120tok_setups(str)
121 char *str;
122{
123 struct tok_state *tok = tok_new();
124 if (tok == NULL)
125 return NULL;
126 tok->buf = tok->cur = str;
127 tok->end = tok->inp = strchr(str, '\0');
128 return tok;
129}
130
131
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000132/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000133
134struct tok_state *
135tok_setupf(fp, ps1, ps2)
136 FILE *fp;
137 char *ps1, *ps2;
138{
139 struct tok_state *tok = tok_new();
140 if (tok == NULL)
141 return NULL;
142 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
143 DEL(tok);
144 return NULL;
145 }
146 tok->cur = tok->inp = tok->buf;
147 tok->end = tok->buf + BUFSIZ;
148 tok->fp = fp;
149 tok->prompt = ps1;
150 tok->nextprompt = ps2;
151 return tok;
152}
153
154
155/* Free a tok_state structure */
156
157void
158tok_free(tok)
159 struct tok_state *tok;
160{
161 /* XXX really need a separate flag to say 'my buffer' */
162 if (tok->fp != NULL && tok->buf != NULL)
163 DEL(tok->buf);
164 DEL(tok);
165}
166
167
168/* Get next char, updating state; error code goes into tok->done */
169
170static int
171tok_nextc(tok)
172 register struct tok_state *tok;
173{
174 if (tok->done != E_OK)
175 return EOF;
176
177 for (;;) {
178 if (tok->cur < tok->inp)
179 return *tok->cur++;
180 if (tok->fp == NULL) {
181 tok->done = E_EOF;
182 return EOF;
183 }
184 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
185 tok->inp = tok->buf;
186 if (tok->inp == tok->end) {
187 int n = tok->end - tok->buf;
188 char *new = tok->buf;
189 RESIZE(new, char, n+n);
190 if (new == NULL) {
191 fprintf(stderr, "tokenizer out of mem\n");
192 tok->done = E_NOMEM;
193 return EOF;
194 }
195 tok->buf = new;
196 tok->inp = tok->buf + n;
197 tok->end = tok->inp + n;
198 }
199#ifdef USE_READLINE
200 if (tok->prompt != NULL) {
201 extern char *readline PROTO((char *prompt));
202 static int been_here;
203 if (!been_here) {
204 /* Force rebind of TAB to insert-tab */
205 extern int rl_insert();
206 rl_bind_key('\t', rl_insert);
207 been_here++;
208 }
209 if (tok->buf != NULL)
210 free(tok->buf);
211 tok->buf = readline(tok->prompt);
212 (void) intrcheck(); /* Clear pending interrupt */
213 if (tok->nextprompt != NULL)
214 tok->prompt = tok->nextprompt;
215 /* XXX different semantics w/o readline()! */
216 if (tok->buf == NULL) {
217 tok->done = E_EOF;
218 }
219 else {
220 unsigned int n = strlen(tok->buf);
221 if (n > 0)
222 add_history(tok->buf);
223 /* Append the '\n' that readline()
224 doesn't give us, for the tokenizer... */
225 tok->buf = realloc(tok->buf, n+2);
226 if (tok->buf == NULL)
227 tok->done = E_NOMEM;
228 else {
229 tok->end = tok->buf + n;
230 *tok->end++ = '\n';
231 *tok->end = '\0';
232 tok->inp = tok->end;
233 tok->cur = tok->buf;
234 }
235 }
236 }
237 else
238#endif
239 {
240 tok->cur = tok->inp;
241 if (tok->prompt != NULL && tok->inp == tok->buf) {
242 fprintf(stderr, "%s", tok->prompt);
243 tok->prompt = tok->nextprompt;
244 }
245 tok->done = fgets_intr(tok->inp,
246 (int)(tok->end - tok->inp), tok->fp);
247 }
248 if (tok->done != E_OK) {
249 if (tok->prompt != NULL)
250 fprintf(stderr, "\n");
251 return EOF;
252 }
253 tok->inp = strchr(tok->inp, '\0');
254 }
255}
256
257
258/* Back-up one character */
259
260static void
261tok_backup(tok, c)
262 register struct tok_state *tok;
263 register int c;
264{
265 if (c != EOF) {
266 if (--tok->cur < tok->buf) {
267 fprintf(stderr, "tok_backup: begin of buffer\n");
268 abort();
269 }
270 if (*tok->cur != c)
271 *tok->cur = c;
272 }
273}
274
275
276/* Return the token corresponding to a single character */
277
278int
279tok_1char(c)
280 int c;
281{
282 switch (c) {
283 case '(': return LPAR;
284 case ')': return RPAR;
285 case '[': return LSQB;
286 case ']': return RSQB;
287 case ':': return COLON;
288 case ',': return COMMA;
289 case ';': return SEMI;
290 case '+': return PLUS;
291 case '-': return MINUS;
292 case '*': return STAR;
293 case '/': return SLASH;
294 case '|': return VBAR;
295 case '&': return AMPER;
296 case '<': return LESS;
297 case '>': return GREATER;
298 case '=': return EQUAL;
299 case '.': return DOT;
300 case '%': return PERCENT;
301 case '`': return BACKQUOTE;
302 case '{': return LBRACE;
303 case '}': return RBRACE;
304 default: return OP;
305 }
306}
307
308
Guido van Rossumfbab9051991-10-20 20:25:03 +0000309int
310tok_2char(c1, c2)
311 int c1, c2;
312{
313 switch (c1) {
314 case '=':
315 switch (c2) {
316 case '=': return EQEQUAL;
317 }
318 break;
319 case '!':
320 switch (c2) {
321 case '=': return NOTEQUAL;
322 }
323 break;
324 case '<':
325 switch (c2) {
326 case '>': return NOTEQUAL;
327 case '=': return LESSEQUAL;
328 }
329 break;
330 case '>':
331 switch (c2) {
332 case '=': return GREATEREQUAL;
333 }
334 break;
335 }
336 return OP;
337}
338
339
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000340/* Get next token, after space stripping etc. */
341
342int
343tok_get(tok, p_start, p_end)
344 register struct tok_state *tok; /* In/out: tokenizer state */
345 char **p_start, **p_end; /* Out: point to start/end of token */
346{
347 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000348 int blankline;
349
350 nextline:
351 blankline = 0;
352
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000353 /* Get indentation level */
354 if (tok->atbol) {
355 register int col = 0;
356 tok->atbol = 0;
357 tok->lineno++;
358 for (;;) {
359 c = tok_nextc(tok);
360 if (c == ' ')
361 col++;
362 else if (c == '\t')
363 col = (col/tok->tabsize + 1) * tok->tabsize;
364 else
365 break;
366 }
367 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000368 if (c == '#' || c == '\n') {
369 /* Lines with only whitespace and/or comments
370 shouldn't affect the indentation and are
371 not passed to the parser as NEWLINE tokens,
372 except *totally* empty lines in interactive
373 mode, which signal the end of a command group. */
374 if (col == 0 && c == '\n' && tok->prompt != NULL)
375 blankline = 0; /* Let it through */
376 else
377 blankline = 1; /* Ignore completely */
378 /* We can't jump back right here since we still
379 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000380 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000381 if (!blankline) {
382 if (col == tok->indstack[tok->indent]) {
383 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000384 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000385 else if (col > tok->indstack[tok->indent]) {
386 /* Indent -- always one */
387 if (tok->indent+1 >= MAXINDENT) {
388 fprintf(stderr, "excessive indent\n");
389 tok->done = E_TOKEN;
390 return ERRORTOKEN;
391 }
392 tok->pendin++;
393 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000394 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000395 else /* col < tok->indstack[tok->indent] */ {
396 /* Dedent -- any number, must be consistent */
397 while (tok->indent > 0 &&
398 col < tok->indstack[tok->indent]) {
399 tok->indent--;
400 tok->pendin--;
401 }
402 if (col != tok->indstack[tok->indent]) {
403 fprintf(stderr, "inconsistent dedent\n");
404 tok->done = E_TOKEN;
405 return ERRORTOKEN;
406 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000407 }
408 }
409 }
410
411 *p_start = *p_end = tok->cur;
412
413 /* Return pending indents/dedents */
414 if (tok->pendin != 0) {
415 if (tok->pendin < 0) {
416 tok->pendin++;
417 return DEDENT;
418 }
419 else {
420 tok->pendin--;
421 return INDENT;
422 }
423 }
424
425 again:
426 /* Skip spaces */
427 do {
428 c = tok_nextc(tok);
429 } while (c == ' ' || c == '\t');
430
431 /* Set start of current token */
432 *p_start = tok->cur - 1;
433
434 /* Skip comment */
435 if (c == '#') {
436 /* Hack to allow overriding the tabsize in the file.
437 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000438 beginning or end of the file. (Will vi never die...?)
439 For Python it must be at the beginning of the file! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000440 int x;
Guido van Rossumb156d721990-12-20 23:13:00 +0000441 /* XXX The case to (unsigned char *) is needed by THINK C 3.0 */
442 if (sscanf(/*(unsigned char *)*/tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000443 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000444 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000445 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000446 tok->tabsize = x;
447 }
448 do {
449 c = tok_nextc(tok);
450 } while (c != EOF && c != '\n');
451 }
452
453 /* Check for EOF and errors now */
454 if (c == EOF)
455 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
456
457 /* Identifier (most frequent token!) */
458 if (isalpha(c) || c == '_') {
459 do {
460 c = tok_nextc(tok);
461 } while (isalnum(c) || c == '_');
462 tok_backup(tok, c);
463 *p_end = tok->cur;
464 return NAME;
465 }
466
467 /* Newline */
468 if (c == '\n') {
469 tok->atbol = 1;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000470 if (blankline)
471 goto nextline;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000472 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
473 return NEWLINE;
474 }
475
476 /* Number */
477 if (isdigit(c)) {
478 if (c == '0') {
479 /* Hex or octal */
480 c = tok_nextc(tok);
481 if (c == '.')
482 goto fraction;
483 if (c == 'x' || c == 'X') {
484 /* Hex */
485 do {
486 c = tok_nextc(tok);
487 } while (isxdigit(c));
488 }
489 else {
490 /* Octal; c is first char of it */
491 /* There's no 'isoctdigit' macro, sigh */
492 while ('0' <= c && c < '8') {
493 c = tok_nextc(tok);
494 }
495 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000496 if (c == 'l' || c == 'L')
497 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000498 }
499 else {
500 /* Decimal */
501 do {
502 c = tok_nextc(tok);
503 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000504 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000505 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000506 else {
507 /* Accept floating point numbers.
508 XXX This accepts incomplete things like
509 XXX 12e or 1e+; worry run-time.
510 XXX Doesn't accept numbers
511 XXX starting with a dot */
512 if (c == '.') {
513 fraction:
514 /* Fraction */
515 do {
516 c = tok_nextc(tok);
517 } while (isdigit(c));
518 }
519 if (c == 'e' || c == 'E') {
520 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000521 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000522 if (c == '+' || c == '-')
523 c = tok_nextc(tok);
524 while (isdigit(c)) {
525 c = tok_nextc(tok);
526 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000527 }
528 }
529 }
530 tok_backup(tok, c);
531 *p_end = tok->cur;
532 return NUMBER;
533 }
534
535 /* String */
536 if (c == '\'') {
537 for (;;) {
538 c = tok_nextc(tok);
539 if (c == '\n' || c == EOF) {
540 tok->done = E_TOKEN;
541 return ERRORTOKEN;
542 }
543 if (c == '\\') {
544 c = tok_nextc(tok);
545 *p_end = tok->cur;
546 if (c == '\n' || c == EOF) {
547 tok->done = E_TOKEN;
548 return ERRORTOKEN;
549 }
550 continue;
551 }
552 if (c == '\'')
553 break;
554 }
555 *p_end = tok->cur;
556 return STRING;
557 }
558
559 /* Line continuation */
560 if (c == '\\') {
561 c = tok_nextc(tok);
562 if (c != '\n') {
563 tok->done = E_TOKEN;
564 return ERRORTOKEN;
565 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000566 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000567 goto again; /* Read next line */
568 }
569
Guido van Rossumfbab9051991-10-20 20:25:03 +0000570 /* Check for two-character token */
571 {
572 int c2 = tok_nextc(tok);
573 int token = tok_2char(c, c2);
574 if (token != OP) {
575 *p_end = tok->cur;
576 return token;
577 }
578 tok_backup(tok, c2);
579 }
580
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000581 /* Punctuation character */
582 *p_end = tok->cur;
583 return tok_1char(c);
584}
585
586
#ifdef DEBUG

/* Print a token on stdout for debugging: its name from tok_name[],
   followed by its text in parentheses for NAME, NUMBER, STRING and
   OP tokens.  `start' and `end' delimit the token's text.  A type
   outside the tok_name[] table is printed as a number instead of
   indexing past the table. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	int ntypes = (int)(sizeof(tok_name) / sizeof(tok_name[0]));

	if (type < 0 || type >= ntypes) {
		printf("<token %d>", type);
		return;
	}
	printf("%s", tok_name[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif