blob: d737e55cbb3fa9dfd6e0e1ae8e53ceefc1901e6a [file] [log] [blame]
/***********************************************************
Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.

STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
24
/* Tokenizer implementation */

/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum3f5da241990-12-20 15:06:42 +000031#include "pgenheaders.h"
32
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033#include <ctype.h>
34#include "string.h"
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#include "tokenizer.h"
38#include "errcode.h"
39
/* Default distance between tab stops; tok_new() copies it into
   tok->tabsize.  Macintosh builds use 4 columns.  Everywhere else the
   value is 8 unless TABSIZE was already defined before this point. */

#ifdef macintosh
#define TABSIZE 4
#endif

#ifndef TABSIZE
#define TABSIZE 8
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
49static struct tok_state *tok_new PROTO((void));
50static int tok_nextc PROTO((struct tok_state *tok));
51static void tok_backup PROTO((struct tok_state *tok, int c));
52
/* Token names */

/* Printable name for each token type; tok_dump() indexes this table
   with the token type.  The order of the entries is presumably that of
   the token numbers defined elsewhere -- verify before reordering. */

char *tok_name[] = {
	/* End of input, and tokens that carry text */
	"ENDMARKER", "NAME", "NUMBER", "STRING",

	/* Line structure */
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",
	"OP",

	/* Pseudo entries */
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
88
89
90/* Create and initialize a new tok_state structure */
91
92static struct tok_state *
93tok_new()
94{
95 struct tok_state *tok = NEW(struct tok_state, 1);
96 if (tok == NULL)
97 return NULL;
98 tok->buf = tok->cur = tok->end = tok->inp = NULL;
99 tok->done = E_OK;
100 tok->fp = NULL;
101 tok->tabsize = TABSIZE;
102 tok->indent = 0;
103 tok->indstack[0] = 0;
104 tok->atbol = 1;
105 tok->pendin = 0;
106 tok->prompt = tok->nextprompt = NULL;
107 tok->lineno = 0;
108 return tok;
109}
110
111
112/* Set up tokenizer for string */
113
114struct tok_state *
115tok_setups(str)
116 char *str;
117{
118 struct tok_state *tok = tok_new();
119 if (tok == NULL)
120 return NULL;
121 tok->buf = tok->cur = str;
122 tok->end = tok->inp = strchr(str, '\0');
123 return tok;
124}
125
126
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000127/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128
129struct tok_state *
130tok_setupf(fp, ps1, ps2)
131 FILE *fp;
132 char *ps1, *ps2;
133{
134 struct tok_state *tok = tok_new();
135 if (tok == NULL)
136 return NULL;
137 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
138 DEL(tok);
139 return NULL;
140 }
141 tok->cur = tok->inp = tok->buf;
142 tok->end = tok->buf + BUFSIZ;
143 tok->fp = fp;
144 tok->prompt = ps1;
145 tok->nextprompt = ps2;
146 return tok;
147}
148
149
150/* Free a tok_state structure */
151
152void
153tok_free(tok)
154 struct tok_state *tok;
155{
156 /* XXX really need a separate flag to say 'my buffer' */
157 if (tok->fp != NULL && tok->buf != NULL)
158 DEL(tok->buf);
159 DEL(tok);
160}
161
162
163/* Get next char, updating state; error code goes into tok->done */
164
165static int
166tok_nextc(tok)
167 register struct tok_state *tok;
168{
169 if (tok->done != E_OK)
170 return EOF;
171
172 for (;;) {
173 if (tok->cur < tok->inp)
174 return *tok->cur++;
175 if (tok->fp == NULL) {
176 tok->done = E_EOF;
177 return EOF;
178 }
179 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
180 tok->inp = tok->buf;
181 if (tok->inp == tok->end) {
182 int n = tok->end - tok->buf;
183 char *new = tok->buf;
184 RESIZE(new, char, n+n);
185 if (new == NULL) {
186 fprintf(stderr, "tokenizer out of mem\n");
187 tok->done = E_NOMEM;
188 return EOF;
189 }
190 tok->buf = new;
191 tok->inp = tok->buf + n;
192 tok->end = tok->inp + n;
193 }
194#ifdef USE_READLINE
195 if (tok->prompt != NULL) {
196 extern char *readline PROTO((char *prompt));
197 static int been_here;
198 if (!been_here) {
199 /* Force rebind of TAB to insert-tab */
200 extern int rl_insert();
201 rl_bind_key('\t', rl_insert);
202 been_here++;
203 }
204 if (tok->buf != NULL)
205 free(tok->buf);
206 tok->buf = readline(tok->prompt);
207 (void) intrcheck(); /* Clear pending interrupt */
208 if (tok->nextprompt != NULL)
209 tok->prompt = tok->nextprompt;
210 /* XXX different semantics w/o readline()! */
211 if (tok->buf == NULL) {
212 tok->done = E_EOF;
213 }
214 else {
215 unsigned int n = strlen(tok->buf);
216 if (n > 0)
217 add_history(tok->buf);
218 /* Append the '\n' that readline()
219 doesn't give us, for the tokenizer... */
220 tok->buf = realloc(tok->buf, n+2);
221 if (tok->buf == NULL)
222 tok->done = E_NOMEM;
223 else {
224 tok->end = tok->buf + n;
225 *tok->end++ = '\n';
226 *tok->end = '\0';
227 tok->inp = tok->end;
228 tok->cur = tok->buf;
229 }
230 }
231 }
232 else
233#endif
234 {
235 tok->cur = tok->inp;
236 if (tok->prompt != NULL && tok->inp == tok->buf) {
237 fprintf(stderr, "%s", tok->prompt);
238 tok->prompt = tok->nextprompt;
239 }
240 tok->done = fgets_intr(tok->inp,
241 (int)(tok->end - tok->inp), tok->fp);
242 }
243 if (tok->done != E_OK) {
244 if (tok->prompt != NULL)
245 fprintf(stderr, "\n");
246 return EOF;
247 }
248 tok->inp = strchr(tok->inp, '\0');
249 }
250}
251
252
253/* Back-up one character */
254
255static void
256tok_backup(tok, c)
257 register struct tok_state *tok;
258 register int c;
259{
260 if (c != EOF) {
261 if (--tok->cur < tok->buf) {
262 fprintf(stderr, "tok_backup: begin of buffer\n");
263 abort();
264 }
265 if (*tok->cur != c)
266 *tok->cur = c;
267 }
268}
269
270
271/* Return the token corresponding to a single character */
272
273int
274tok_1char(c)
275 int c;
276{
277 switch (c) {
278 case '(': return LPAR;
279 case ')': return RPAR;
280 case '[': return LSQB;
281 case ']': return RSQB;
282 case ':': return COLON;
283 case ',': return COMMA;
284 case ';': return SEMI;
285 case '+': return PLUS;
286 case '-': return MINUS;
287 case '*': return STAR;
288 case '/': return SLASH;
289 case '|': return VBAR;
290 case '&': return AMPER;
291 case '<': return LESS;
292 case '>': return GREATER;
293 case '=': return EQUAL;
294 case '.': return DOT;
295 case '%': return PERCENT;
296 case '`': return BACKQUOTE;
297 case '{': return LBRACE;
298 case '}': return RBRACE;
299 default: return OP;
300 }
301}
302
303
304/* Get next token, after space stripping etc. */
305
306int
307tok_get(tok, p_start, p_end)
308 register struct tok_state *tok; /* In/out: tokenizer state */
309 char **p_start, **p_end; /* Out: point to start/end of token */
310{
311 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000312 int blankline;
313
314 nextline:
315 blankline = 0;
316
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000317 /* Get indentation level */
318 if (tok->atbol) {
319 register int col = 0;
320 tok->atbol = 0;
321 tok->lineno++;
322 for (;;) {
323 c = tok_nextc(tok);
324 if (c == ' ')
325 col++;
326 else if (c == '\t')
327 col = (col/tok->tabsize + 1) * tok->tabsize;
328 else
329 break;
330 }
331 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000332 if (c == '#' || c == '\n') {
333 /* Lines with only whitespace and/or comments
334 shouldn't affect the indentation and are
335 not passed to the parser as NEWLINE tokens,
336 except *totally* empty lines in interactive
337 mode, which signal the end of a command group. */
338 if (col == 0 && c == '\n' && tok->prompt != NULL)
339 blankline = 0; /* Let it through */
340 else
341 blankline = 1; /* Ignore completely */
342 /* We can't jump back right here since we still
343 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000344 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000345 if (!blankline) {
346 if (col == tok->indstack[tok->indent]) {
347 /* No change */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000348 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000349 else if (col > tok->indstack[tok->indent]) {
350 /* Indent -- always one */
351 if (tok->indent+1 >= MAXINDENT) {
352 fprintf(stderr, "excessive indent\n");
353 tok->done = E_TOKEN;
354 return ERRORTOKEN;
355 }
356 tok->pendin++;
357 tok->indstack[++tok->indent] = col;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000358 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000359 else /* col < tok->indstack[tok->indent] */ {
360 /* Dedent -- any number, must be consistent */
361 while (tok->indent > 0 &&
362 col < tok->indstack[tok->indent]) {
363 tok->indent--;
364 tok->pendin--;
365 }
366 if (col != tok->indstack[tok->indent]) {
367 fprintf(stderr, "inconsistent dedent\n");
368 tok->done = E_TOKEN;
369 return ERRORTOKEN;
370 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000371 }
372 }
373 }
374
375 *p_start = *p_end = tok->cur;
376
377 /* Return pending indents/dedents */
378 if (tok->pendin != 0) {
379 if (tok->pendin < 0) {
380 tok->pendin++;
381 return DEDENT;
382 }
383 else {
384 tok->pendin--;
385 return INDENT;
386 }
387 }
388
389 again:
390 /* Skip spaces */
391 do {
392 c = tok_nextc(tok);
393 } while (c == ' ' || c == '\t');
394
395 /* Set start of current token */
396 *p_start = tok->cur - 1;
397
398 /* Skip comment */
399 if (c == '#') {
400 /* Hack to allow overriding the tabsize in the file.
401 This is also recognized by vi, when it occurs near the
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000402 beginning or end of the file. (Will vi never die...?)
403 For Python it must be at the beginning of the file! */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000404 int x;
Guido van Rossumb156d721990-12-20 23:13:00 +0000405 /* XXX The case to (unsigned char *) is needed by THINK C 3.0 */
406 if (sscanf(/*(unsigned char *)*/tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000407 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000408 x >= 1 && x <= 40) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000409 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000410 tok->tabsize = x;
411 }
412 do {
413 c = tok_nextc(tok);
414 } while (c != EOF && c != '\n');
415 }
416
417 /* Check for EOF and errors now */
418 if (c == EOF)
419 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
420
421 /* Identifier (most frequent token!) */
422 if (isalpha(c) || c == '_') {
423 do {
424 c = tok_nextc(tok);
425 } while (isalnum(c) || c == '_');
426 tok_backup(tok, c);
427 *p_end = tok->cur;
428 return NAME;
429 }
430
431 /* Newline */
432 if (c == '\n') {
433 tok->atbol = 1;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000434 if (blankline)
435 goto nextline;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000436 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
437 return NEWLINE;
438 }
439
440 /* Number */
441 if (isdigit(c)) {
442 if (c == '0') {
443 /* Hex or octal */
444 c = tok_nextc(tok);
445 if (c == '.')
446 goto fraction;
447 if (c == 'x' || c == 'X') {
448 /* Hex */
449 do {
450 c = tok_nextc(tok);
451 } while (isxdigit(c));
452 }
453 else {
454 /* Octal; c is first char of it */
455 /* There's no 'isoctdigit' macro, sigh */
456 while ('0' <= c && c < '8') {
457 c = tok_nextc(tok);
458 }
459 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000460 if (c == 'l' || c == 'L')
461 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000462 }
463 else {
464 /* Decimal */
465 do {
466 c = tok_nextc(tok);
467 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000468 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000469 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000470 else {
471 /* Accept floating point numbers.
472 XXX This accepts incomplete things like
473 XXX 12e or 1e+; worry run-time.
474 XXX Doesn't accept numbers
475 XXX starting with a dot */
476 if (c == '.') {
477 fraction:
478 /* Fraction */
479 do {
480 c = tok_nextc(tok);
481 } while (isdigit(c));
482 }
483 if (c == 'e' || c == 'E') {
484 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000485 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000486 if (c == '+' || c == '-')
487 c = tok_nextc(tok);
488 while (isdigit(c)) {
489 c = tok_nextc(tok);
490 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000491 }
492 }
493 }
494 tok_backup(tok, c);
495 *p_end = tok->cur;
496 return NUMBER;
497 }
498
499 /* String */
500 if (c == '\'') {
501 for (;;) {
502 c = tok_nextc(tok);
503 if (c == '\n' || c == EOF) {
504 tok->done = E_TOKEN;
505 return ERRORTOKEN;
506 }
507 if (c == '\\') {
508 c = tok_nextc(tok);
509 *p_end = tok->cur;
510 if (c == '\n' || c == EOF) {
511 tok->done = E_TOKEN;
512 return ERRORTOKEN;
513 }
514 continue;
515 }
516 if (c == '\'')
517 break;
518 }
519 *p_end = tok->cur;
520 return STRING;
521 }
522
523 /* Line continuation */
524 if (c == '\\') {
525 c = tok_nextc(tok);
526 if (c != '\n') {
527 tok->done = E_TOKEN;
528 return ERRORTOKEN;
529 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000530 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000531 goto again; /* Read next line */
532 }
533
534 /* Punctuation character */
535 *p_end = tok->cur;
536 return tok_1char(c);
537}
538
539
#ifdef DEBUG

/* Debugging aid: print the name of token type 'type' on stdout.  For
   NAME, NUMBER, STRING and OP tokens the token text, the characters
   from 'start' up to but not including 'end', follows in parentheses.
   No newline is written. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", tok_name[type]);

	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		/* The name says it all for the other token types */
		break;
	}
}

#endif