blob: ccff9d11377b03517fcb38c9bfa381920216e8be [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000025/* Tokenizer implementation */
26
27/* XXX This is rather old, should be restructured perhaps */
28/* XXX Need a better interface to report errors than writing to stderr */
Guido van Rossum3f5da241990-12-20 15:06:42 +000029/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000030
Guido van Rossum3f5da241990-12-20 15:06:42 +000031#include "pgenheaders.h"
32
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000033#include <ctype.h>
34#include "string.h"
35
Guido van Rossum3f5da241990-12-20 15:06:42 +000036#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#include "tokenizer.h"
38#include "errcode.h"
39
/* Default tab stop width, used to initialise tok->tabsize.
   When the `macintosh' macro is defined the width is 4; otherwise it is 8
   unless TABSIZE has already been defined before this point. */
#if defined(macintosh)
#define TABSIZE 4
#endif

#if !defined(TABSIZE)
#define TABSIZE 8
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
49static struct tok_state *tok_new PROTO((void));
50static int tok_nextc PROTO((struct tok_state *tok));
51static void tok_backup PROTO((struct tok_state *tok, int c));
52
/* Token names.
   The table is indexed by token type (tok_dump() prints tok_name[type]).
   NOTE(review): the order presumably has to match the numbering of the
   token type constants declared elsewhere -- confirm before reordering. */

char *tok_name[] = {
	"ENDMARKER",		/*  0 */
	"NAME",			/*  1 */
	"NUMBER",		/*  2 */
	"STRING",		/*  3 */
	"NEWLINE",		/*  4 */
	"INDENT",		/*  5 */
	"DEDENT",		/*  6 */
	"LPAR",			/*  7 */
	"RPAR",			/*  8 */
	"LSQB",			/*  9 */
	"RSQB",			/* 10 */
	"COLON",		/* 11 */
	"COMMA",		/* 12 */
	"SEMI",			/* 13 */
	"PLUS",			/* 14 */
	"MINUS",		/* 15 */
	"STAR",			/* 16 */
	"SLASH",		/* 17 */
	"VBAR",			/* 18 */
	"AMPER",		/* 19 */
	"LESS",			/* 20 */
	"GREATER",		/* 21 */
	"EQUAL",		/* 22 */
	"DOT",			/* 23 */
	"PERCENT",		/* 24 */
	"BACKQUOTE",		/* 25 */
	"LBRACE",		/* 26 */
	"RBRACE",		/* 27 */
	"OP",			/* 28 */
	"<ERRORTOKEN>",		/* 29 */
	"<N_TOKENS>"		/* 30 */
};
88
89
90/* Create and initialize a new tok_state structure */
91
92static struct tok_state *
93tok_new()
94{
95 struct tok_state *tok = NEW(struct tok_state, 1);
96 if (tok == NULL)
97 return NULL;
98 tok->buf = tok->cur = tok->end = tok->inp = NULL;
99 tok->done = E_OK;
100 tok->fp = NULL;
101 tok->tabsize = TABSIZE;
102 tok->indent = 0;
103 tok->indstack[0] = 0;
104 tok->atbol = 1;
105 tok->pendin = 0;
106 tok->prompt = tok->nextprompt = NULL;
107 tok->lineno = 0;
108 return tok;
109}
110
111
112/* Set up tokenizer for string */
113
114struct tok_state *
115tok_setups(str)
116 char *str;
117{
118 struct tok_state *tok = tok_new();
119 if (tok == NULL)
120 return NULL;
121 tok->buf = tok->cur = str;
122 tok->end = tok->inp = strchr(str, '\0');
123 return tok;
124}
125
126
127/* Set up tokenizer for string */
128
129struct tok_state *
130tok_setupf(fp, ps1, ps2)
131 FILE *fp;
132 char *ps1, *ps2;
133{
134 struct tok_state *tok = tok_new();
135 if (tok == NULL)
136 return NULL;
137 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
138 DEL(tok);
139 return NULL;
140 }
141 tok->cur = tok->inp = tok->buf;
142 tok->end = tok->buf + BUFSIZ;
143 tok->fp = fp;
144 tok->prompt = ps1;
145 tok->nextprompt = ps2;
146 return tok;
147}
148
149
150/* Free a tok_state structure */
151
152void
153tok_free(tok)
154 struct tok_state *tok;
155{
156 /* XXX really need a separate flag to say 'my buffer' */
157 if (tok->fp != NULL && tok->buf != NULL)
158 DEL(tok->buf);
159 DEL(tok);
160}
161
162
163/* Get next char, updating state; error code goes into tok->done */
164
165static int
166tok_nextc(tok)
167 register struct tok_state *tok;
168{
169 if (tok->done != E_OK)
170 return EOF;
171
172 for (;;) {
173 if (tok->cur < tok->inp)
174 return *tok->cur++;
175 if (tok->fp == NULL) {
176 tok->done = E_EOF;
177 return EOF;
178 }
179 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
180 tok->inp = tok->buf;
181 if (tok->inp == tok->end) {
182 int n = tok->end - tok->buf;
183 char *new = tok->buf;
184 RESIZE(new, char, n+n);
185 if (new == NULL) {
186 fprintf(stderr, "tokenizer out of mem\n");
187 tok->done = E_NOMEM;
188 return EOF;
189 }
190 tok->buf = new;
191 tok->inp = tok->buf + n;
192 tok->end = tok->inp + n;
193 }
194#ifdef USE_READLINE
195 if (tok->prompt != NULL) {
196 extern char *readline PROTO((char *prompt));
197 static int been_here;
198 if (!been_here) {
199 /* Force rebind of TAB to insert-tab */
200 extern int rl_insert();
201 rl_bind_key('\t', rl_insert);
202 been_here++;
203 }
204 if (tok->buf != NULL)
205 free(tok->buf);
206 tok->buf = readline(tok->prompt);
207 (void) intrcheck(); /* Clear pending interrupt */
208 if (tok->nextprompt != NULL)
209 tok->prompt = tok->nextprompt;
210 /* XXX different semantics w/o readline()! */
211 if (tok->buf == NULL) {
212 tok->done = E_EOF;
213 }
214 else {
215 unsigned int n = strlen(tok->buf);
216 if (n > 0)
217 add_history(tok->buf);
218 /* Append the '\n' that readline()
219 doesn't give us, for the tokenizer... */
220 tok->buf = realloc(tok->buf, n+2);
221 if (tok->buf == NULL)
222 tok->done = E_NOMEM;
223 else {
224 tok->end = tok->buf + n;
225 *tok->end++ = '\n';
226 *tok->end = '\0';
227 tok->inp = tok->end;
228 tok->cur = tok->buf;
229 }
230 }
231 }
232 else
233#endif
234 {
235 tok->cur = tok->inp;
236 if (tok->prompt != NULL && tok->inp == tok->buf) {
237 fprintf(stderr, "%s", tok->prompt);
238 tok->prompt = tok->nextprompt;
239 }
240 tok->done = fgets_intr(tok->inp,
241 (int)(tok->end - tok->inp), tok->fp);
242 }
243 if (tok->done != E_OK) {
244 if (tok->prompt != NULL)
245 fprintf(stderr, "\n");
246 return EOF;
247 }
248 tok->inp = strchr(tok->inp, '\0');
249 }
250}
251
252
253/* Back-up one character */
254
255static void
256tok_backup(tok, c)
257 register struct tok_state *tok;
258 register int c;
259{
260 if (c != EOF) {
261 if (--tok->cur < tok->buf) {
262 fprintf(stderr, "tok_backup: begin of buffer\n");
263 abort();
264 }
265 if (*tok->cur != c)
266 *tok->cur = c;
267 }
268}
269
270
271/* Return the token corresponding to a single character */
272
273int
274tok_1char(c)
275 int c;
276{
277 switch (c) {
278 case '(': return LPAR;
279 case ')': return RPAR;
280 case '[': return LSQB;
281 case ']': return RSQB;
282 case ':': return COLON;
283 case ',': return COMMA;
284 case ';': return SEMI;
285 case '+': return PLUS;
286 case '-': return MINUS;
287 case '*': return STAR;
288 case '/': return SLASH;
289 case '|': return VBAR;
290 case '&': return AMPER;
291 case '<': return LESS;
292 case '>': return GREATER;
293 case '=': return EQUAL;
294 case '.': return DOT;
295 case '%': return PERCENT;
296 case '`': return BACKQUOTE;
297 case '{': return LBRACE;
298 case '}': return RBRACE;
299 default: return OP;
300 }
301}
302
303
304/* Get next token, after space stripping etc. */
305
306int
307tok_get(tok, p_start, p_end)
308 register struct tok_state *tok; /* In/out: tokenizer state */
309 char **p_start, **p_end; /* Out: point to start/end of token */
310{
311 register int c;
312
313 /* Get indentation level */
314 if (tok->atbol) {
315 register int col = 0;
316 tok->atbol = 0;
317 tok->lineno++;
318 for (;;) {
319 c = tok_nextc(tok);
320 if (c == ' ')
321 col++;
322 else if (c == '\t')
323 col = (col/tok->tabsize + 1) * tok->tabsize;
324 else
325 break;
326 }
327 tok_backup(tok, c);
328 if (col == tok->indstack[tok->indent]) {
329 /* No change */
330 }
331 else if (col > tok->indstack[tok->indent]) {
332 /* Indent -- always one */
333 if (tok->indent+1 >= MAXINDENT) {
334 fprintf(stderr, "excessive indent\n");
335 tok->done = E_TOKEN;
336 return ERRORTOKEN;
337 }
338 tok->pendin++;
339 tok->indstack[++tok->indent] = col;
340 }
341 else /* col < tok->indstack[tok->indent] */ {
342 /* Dedent -- any number, must be consistent */
343 while (tok->indent > 0 &&
344 col < tok->indstack[tok->indent]) {
345 tok->indent--;
346 tok->pendin--;
347 }
348 if (col != tok->indstack[tok->indent]) {
349 fprintf(stderr, "inconsistent dedent\n");
350 tok->done = E_TOKEN;
351 return ERRORTOKEN;
352 }
353 }
354 }
355
356 *p_start = *p_end = tok->cur;
357
358 /* Return pending indents/dedents */
359 if (tok->pendin != 0) {
360 if (tok->pendin < 0) {
361 tok->pendin++;
362 return DEDENT;
363 }
364 else {
365 tok->pendin--;
366 return INDENT;
367 }
368 }
369
370 again:
371 /* Skip spaces */
372 do {
373 c = tok_nextc(tok);
374 } while (c == ' ' || c == '\t');
375
376 /* Set start of current token */
377 *p_start = tok->cur - 1;
378
379 /* Skip comment */
380 if (c == '#') {
381 /* Hack to allow overriding the tabsize in the file.
382 This is also recognized by vi, when it occurs near the
383 beginning or end of the file. (Will vi never die...?) */
384 int x;
Guido van Rossumb156d721990-12-20 23:13:00 +0000385 /* XXX The case to (unsigned char *) is needed by THINK C 3.0 */
386 if (sscanf(/*(unsigned char *)*/tok->cur,
Guido van Rossum3f5da241990-12-20 15:06:42 +0000387 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000388 x >= 1 && x <= 40) {
389 fprintf(stderr, "# vi:set tabsize=%d:\n", x);
390 tok->tabsize = x;
391 }
392 do {
393 c = tok_nextc(tok);
394 } while (c != EOF && c != '\n');
395 }
396
397 /* Check for EOF and errors now */
398 if (c == EOF)
399 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
400
401 /* Identifier (most frequent token!) */
402 if (isalpha(c) || c == '_') {
403 do {
404 c = tok_nextc(tok);
405 } while (isalnum(c) || c == '_');
406 tok_backup(tok, c);
407 *p_end = tok->cur;
408 return NAME;
409 }
410
411 /* Newline */
412 if (c == '\n') {
413 tok->atbol = 1;
414 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
415 return NEWLINE;
416 }
417
418 /* Number */
419 if (isdigit(c)) {
420 if (c == '0') {
421 /* Hex or octal */
422 c = tok_nextc(tok);
423 if (c == '.')
424 goto fraction;
425 if (c == 'x' || c == 'X') {
426 /* Hex */
427 do {
428 c = tok_nextc(tok);
429 } while (isxdigit(c));
430 }
431 else {
432 /* Octal; c is first char of it */
433 /* There's no 'isoctdigit' macro, sigh */
434 while ('0' <= c && c < '8') {
435 c = tok_nextc(tok);
436 }
437 }
Guido van Rossumf023c461991-05-05 20:16:20 +0000438 if (c == 'l' || c == 'L')
439 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000440 }
441 else {
442 /* Decimal */
443 do {
444 c = tok_nextc(tok);
445 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +0000446 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000447 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000448 else {
449 /* Accept floating point numbers.
450 XXX This accepts incomplete things like
451 XXX 12e or 1e+; worry run-time.
452 XXX Doesn't accept numbers
453 XXX starting with a dot */
454 if (c == '.') {
455 fraction:
456 /* Fraction */
457 do {
458 c = tok_nextc(tok);
459 } while (isdigit(c));
460 }
461 if (c == 'e' || c == 'E') {
462 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000463 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +0000464 if (c == '+' || c == '-')
465 c = tok_nextc(tok);
466 while (isdigit(c)) {
467 c = tok_nextc(tok);
468 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000469 }
470 }
471 }
472 tok_backup(tok, c);
473 *p_end = tok->cur;
474 return NUMBER;
475 }
476
477 /* String */
478 if (c == '\'') {
479 for (;;) {
480 c = tok_nextc(tok);
481 if (c == '\n' || c == EOF) {
482 tok->done = E_TOKEN;
483 return ERRORTOKEN;
484 }
485 if (c == '\\') {
486 c = tok_nextc(tok);
487 *p_end = tok->cur;
488 if (c == '\n' || c == EOF) {
489 tok->done = E_TOKEN;
490 return ERRORTOKEN;
491 }
492 continue;
493 }
494 if (c == '\'')
495 break;
496 }
497 *p_end = tok->cur;
498 return STRING;
499 }
500
501 /* Line continuation */
502 if (c == '\\') {
503 c = tok_nextc(tok);
504 if (c != '\n') {
505 tok->done = E_TOKEN;
506 return ERRORTOKEN;
507 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000508 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000509 goto again; /* Read next line */
510 }
511
512 /* Punctuation character */
513 *p_end = tok->cur;
514 return tok_1char(c);
515}
516
517
#ifdef DEBUG

/* Debugging aid: print the name of a token type on stdout.  For NAME,
   NUMBER, STRING and OP tokens the token text, i.e. the characters from
   start up to (not including) end, follows in parentheses. */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", tok_name[type]);

	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		/* Other token types are identified by name alone */
		break;
	}
}

#endif