blob: ad6f63ac02d6a129c215db9777ec51b7a72d7852 [file] [log] [blame]
/* Tokenizer implementation */

/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00006
Guido van Rossum3f5da241990-12-20 15:06:42 +00007#include "pgenheaders.h"
8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009#include <ctype.h>
10#include "string.h"
11
Guido van Rossum3f5da241990-12-20 15:06:42 +000012#include "fgetsintr.h"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000013#include "tokenizer.h"
14#include "errcode.h"
15
/* Default tab stop width used when measuring indentation.
   THINK C builds use 4; any other build uses 8 unless TABSIZE has
   already been defined. */
#if defined(THINK_C)
#define TABSIZE 4
#endif

#if !defined(TABSIZE)
#define TABSIZE 8
#endif
23
/* Forward declarations of the file-local helpers defined further down.
   PROTO() is not defined in this file; presumably it lets pre-ANSI
   compilers drop the parameter list -- confirm in pgenheaders.h. */
static struct tok_state *tok_new PROTO((void));	/* allocate + initialize a state */
static int tok_nextc PROTO((struct tok_state *tok));	/* fetch next character */
static void tok_backup PROTO((struct tok_state *tok, int c));	/* un-read a character */
28
/* Token names.
   Printable name for each token type; tok_dump() indexes this table
   with the token type.  The order presumably has to match the token
   numbering declared in the token header -- confirm before editing. */

char *tok_name[] = {
	/* Markers, literals and layout tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets */
	"LPAR", "RPAR", "LSQB", "RSQB",

	/* Separators */
	"COLON", "COMMA", "SEMI",

	/* Operators and other single-character tokens */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",

	/* Braces */
	"LBRACE", "RBRACE",

	/* Catch-all and bookkeeping entries */
	"OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
64
65
66/* Create and initialize a new tok_state structure */
67
68static struct tok_state *
69tok_new()
70{
71 struct tok_state *tok = NEW(struct tok_state, 1);
72 if (tok == NULL)
73 return NULL;
74 tok->buf = tok->cur = tok->end = tok->inp = NULL;
75 tok->done = E_OK;
76 tok->fp = NULL;
77 tok->tabsize = TABSIZE;
78 tok->indent = 0;
79 tok->indstack[0] = 0;
80 tok->atbol = 1;
81 tok->pendin = 0;
82 tok->prompt = tok->nextprompt = NULL;
83 tok->lineno = 0;
84 return tok;
85}
86
87
88/* Set up tokenizer for string */
89
90struct tok_state *
91tok_setups(str)
92 char *str;
93{
94 struct tok_state *tok = tok_new();
95 if (tok == NULL)
96 return NULL;
97 tok->buf = tok->cur = str;
98 tok->end = tok->inp = strchr(str, '\0');
99 return tok;
100}
101
102
103/* Set up tokenizer for string */
104
105struct tok_state *
106tok_setupf(fp, ps1, ps2)
107 FILE *fp;
108 char *ps1, *ps2;
109{
110 struct tok_state *tok = tok_new();
111 if (tok == NULL)
112 return NULL;
113 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
114 DEL(tok);
115 return NULL;
116 }
117 tok->cur = tok->inp = tok->buf;
118 tok->end = tok->buf + BUFSIZ;
119 tok->fp = fp;
120 tok->prompt = ps1;
121 tok->nextprompt = ps2;
122 return tok;
123}
124
125
126/* Free a tok_state structure */
127
128void
129tok_free(tok)
130 struct tok_state *tok;
131{
132 /* XXX really need a separate flag to say 'my buffer' */
133 if (tok->fp != NULL && tok->buf != NULL)
134 DEL(tok->buf);
135 DEL(tok);
136}
137
138
139/* Get next char, updating state; error code goes into tok->done */
140
141static int
142tok_nextc(tok)
143 register struct tok_state *tok;
144{
145 if (tok->done != E_OK)
146 return EOF;
147
148 for (;;) {
149 if (tok->cur < tok->inp)
150 return *tok->cur++;
151 if (tok->fp == NULL) {
152 tok->done = E_EOF;
153 return EOF;
154 }
155 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
156 tok->inp = tok->buf;
157 if (tok->inp == tok->end) {
158 int n = tok->end - tok->buf;
159 char *new = tok->buf;
160 RESIZE(new, char, n+n);
161 if (new == NULL) {
162 fprintf(stderr, "tokenizer out of mem\n");
163 tok->done = E_NOMEM;
164 return EOF;
165 }
166 tok->buf = new;
167 tok->inp = tok->buf + n;
168 tok->end = tok->inp + n;
169 }
170#ifdef USE_READLINE
171 if (tok->prompt != NULL) {
172 extern char *readline PROTO((char *prompt));
173 static int been_here;
174 if (!been_here) {
175 /* Force rebind of TAB to insert-tab */
176 extern int rl_insert();
177 rl_bind_key('\t', rl_insert);
178 been_here++;
179 }
180 if (tok->buf != NULL)
181 free(tok->buf);
182 tok->buf = readline(tok->prompt);
183 (void) intrcheck(); /* Clear pending interrupt */
184 if (tok->nextprompt != NULL)
185 tok->prompt = tok->nextprompt;
186 /* XXX different semantics w/o readline()! */
187 if (tok->buf == NULL) {
188 tok->done = E_EOF;
189 }
190 else {
191 unsigned int n = strlen(tok->buf);
192 if (n > 0)
193 add_history(tok->buf);
194 /* Append the '\n' that readline()
195 doesn't give us, for the tokenizer... */
196 tok->buf = realloc(tok->buf, n+2);
197 if (tok->buf == NULL)
198 tok->done = E_NOMEM;
199 else {
200 tok->end = tok->buf + n;
201 *tok->end++ = '\n';
202 *tok->end = '\0';
203 tok->inp = tok->end;
204 tok->cur = tok->buf;
205 }
206 }
207 }
208 else
209#endif
210 {
211 tok->cur = tok->inp;
212 if (tok->prompt != NULL && tok->inp == tok->buf) {
213 fprintf(stderr, "%s", tok->prompt);
214 tok->prompt = tok->nextprompt;
215 }
216 tok->done = fgets_intr(tok->inp,
217 (int)(tok->end - tok->inp), tok->fp);
218 }
219 if (tok->done != E_OK) {
220 if (tok->prompt != NULL)
221 fprintf(stderr, "\n");
222 return EOF;
223 }
224 tok->inp = strchr(tok->inp, '\0');
225 }
226}
227
228
229/* Back-up one character */
230
231static void
232tok_backup(tok, c)
233 register struct tok_state *tok;
234 register int c;
235{
236 if (c != EOF) {
237 if (--tok->cur < tok->buf) {
238 fprintf(stderr, "tok_backup: begin of buffer\n");
239 abort();
240 }
241 if (*tok->cur != c)
242 *tok->cur = c;
243 }
244}
245
246
247/* Return the token corresponding to a single character */
248
249int
250tok_1char(c)
251 int c;
252{
253 switch (c) {
254 case '(': return LPAR;
255 case ')': return RPAR;
256 case '[': return LSQB;
257 case ']': return RSQB;
258 case ':': return COLON;
259 case ',': return COMMA;
260 case ';': return SEMI;
261 case '+': return PLUS;
262 case '-': return MINUS;
263 case '*': return STAR;
264 case '/': return SLASH;
265 case '|': return VBAR;
266 case '&': return AMPER;
267 case '<': return LESS;
268 case '>': return GREATER;
269 case '=': return EQUAL;
270 case '.': return DOT;
271 case '%': return PERCENT;
272 case '`': return BACKQUOTE;
273 case '{': return LBRACE;
274 case '}': return RBRACE;
275 default: return OP;
276 }
277}
278
279
280/* Get next token, after space stripping etc. */
281
282int
283tok_get(tok, p_start, p_end)
284 register struct tok_state *tok; /* In/out: tokenizer state */
285 char **p_start, **p_end; /* Out: point to start/end of token */
286{
287 register int c;
288
289 /* Get indentation level */
290 if (tok->atbol) {
291 register int col = 0;
292 tok->atbol = 0;
293 tok->lineno++;
294 for (;;) {
295 c = tok_nextc(tok);
296 if (c == ' ')
297 col++;
298 else if (c == '\t')
299 col = (col/tok->tabsize + 1) * tok->tabsize;
300 else
301 break;
302 }
303 tok_backup(tok, c);
304 if (col == tok->indstack[tok->indent]) {
305 /* No change */
306 }
307 else if (col > tok->indstack[tok->indent]) {
308 /* Indent -- always one */
309 if (tok->indent+1 >= MAXINDENT) {
310 fprintf(stderr, "excessive indent\n");
311 tok->done = E_TOKEN;
312 return ERRORTOKEN;
313 }
314 tok->pendin++;
315 tok->indstack[++tok->indent] = col;
316 }
317 else /* col < tok->indstack[tok->indent] */ {
318 /* Dedent -- any number, must be consistent */
319 while (tok->indent > 0 &&
320 col < tok->indstack[tok->indent]) {
321 tok->indent--;
322 tok->pendin--;
323 }
324 if (col != tok->indstack[tok->indent]) {
325 fprintf(stderr, "inconsistent dedent\n");
326 tok->done = E_TOKEN;
327 return ERRORTOKEN;
328 }
329 }
330 }
331
332 *p_start = *p_end = tok->cur;
333
334 /* Return pending indents/dedents */
335 if (tok->pendin != 0) {
336 if (tok->pendin < 0) {
337 tok->pendin++;
338 return DEDENT;
339 }
340 else {
341 tok->pendin--;
342 return INDENT;
343 }
344 }
345
346 again:
347 /* Skip spaces */
348 do {
349 c = tok_nextc(tok);
350 } while (c == ' ' || c == '\t');
351
352 /* Set start of current token */
353 *p_start = tok->cur - 1;
354
355 /* Skip comment */
356 if (c == '#') {
357 /* Hack to allow overriding the tabsize in the file.
358 This is also recognized by vi, when it occurs near the
359 beginning or end of the file. (Will vi never die...?) */
360 int x;
Guido van Rossum3f5da241990-12-20 15:06:42 +0000361 /* XXX The case to (unsigned char *) is needed by THINK C */
362 if (sscanf((unsigned char *)tok->cur,
363 " vi:set tabsize=%d:", &x) == 1 &&
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000364 x >= 1 && x <= 40) {
365 fprintf(stderr, "# vi:set tabsize=%d:\n", x);
366 tok->tabsize = x;
367 }
368 do {
369 c = tok_nextc(tok);
370 } while (c != EOF && c != '\n');
371 }
372
373 /* Check for EOF and errors now */
374 if (c == EOF)
375 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
376
377 /* Identifier (most frequent token!) */
378 if (isalpha(c) || c == '_') {
379 do {
380 c = tok_nextc(tok);
381 } while (isalnum(c) || c == '_');
382 tok_backup(tok, c);
383 *p_end = tok->cur;
384 return NAME;
385 }
386
387 /* Newline */
388 if (c == '\n') {
389 tok->atbol = 1;
390 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
391 return NEWLINE;
392 }
393
394 /* Number */
395 if (isdigit(c)) {
396 if (c == '0') {
397 /* Hex or octal */
398 c = tok_nextc(tok);
399 if (c == '.')
400 goto fraction;
401 if (c == 'x' || c == 'X') {
402 /* Hex */
403 do {
404 c = tok_nextc(tok);
405 } while (isxdigit(c));
406 }
407 else {
408 /* Octal; c is first char of it */
409 /* There's no 'isoctdigit' macro, sigh */
410 while ('0' <= c && c < '8') {
411 c = tok_nextc(tok);
412 }
413 }
414 }
415 else {
416 /* Decimal */
417 do {
418 c = tok_nextc(tok);
419 } while (isdigit(c));
420 /* Accept floating point numbers.
421 XXX This accepts incomplete things like 12e or 1e+;
422 worry about that at run-time.
423 XXX Doesn't accept numbers starting with a dot */
424 if (c == '.') {
425 fraction:
426 /* Fraction */
427 do {
428 c = tok_nextc(tok);
429 } while (isdigit(c));
430 }
431 if (c == 'e' || c == 'E') {
432 /* Exponent part */
433 c = tok_nextc(tok);
434 if (c == '+' || c == '-')
435 c = tok_nextc(tok);
436 while (isdigit(c)) {
437 c = tok_nextc(tok);
438 }
439 }
440 }
441 tok_backup(tok, c);
442 *p_end = tok->cur;
443 return NUMBER;
444 }
445
446 /* String */
447 if (c == '\'') {
448 for (;;) {
449 c = tok_nextc(tok);
450 if (c == '\n' || c == EOF) {
451 tok->done = E_TOKEN;
452 return ERRORTOKEN;
453 }
454 if (c == '\\') {
455 c = tok_nextc(tok);
456 *p_end = tok->cur;
457 if (c == '\n' || c == EOF) {
458 tok->done = E_TOKEN;
459 return ERRORTOKEN;
460 }
461 continue;
462 }
463 if (c == '\'')
464 break;
465 }
466 *p_end = tok->cur;
467 return STRING;
468 }
469
470 /* Line continuation */
471 if (c == '\\') {
472 c = tok_nextc(tok);
473 if (c != '\n') {
474 tok->done = E_TOKEN;
475 return ERRORTOKEN;
476 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000477 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000478 goto again; /* Read next line */
479 }
480
481 /* Punctuation character */
482 *p_end = tok->cur;
483 return tok_1char(c);
484}
485
486
487#ifdef DEBUG
488
489void
490tok_dump(type, start, end)
491 int type;
492 char *start, *end;
493{
494 printf("%s", tok_name[type]);
495 if (type == NAME || type == NUMBER || type == STRING || type == OP)
496 printf("(%.*s)", (int)(end - start), start);
497}
498
499#endif