blob: 97eda75794c77be26c1ef48ff69b89cc33cbe525 [file] [log] [blame]
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001/* Tokenizer implementation */
2
3/* XXX This is rather old, should be restructured perhaps */
4/* XXX Need a better interface to report errors than writing to stderr */
5
6#include <stdio.h>
7#include <ctype.h>
8#include "string.h"
9
10#include "PROTO.h"
11#include "malloc.h"
12#include "tokenizer.h"
13#include "errcode.h"
14
15#ifdef THINK_C
16#define TABSIZE 4
17#endif
18
19#ifndef TABSIZE
20#define TABSIZE 8
21#endif
22
/* Printable token names, indexed by token type.
   NOTE(review): the order presumably mirrors the token numbering in the
   tokenizer headers -- confirm before adding, removing or moving entries. */

char *tok_name[] = {
	/* End of input, then tokens that carry text */
	"ENDMARKER", "NAME", "NUMBER", "STRING",

	/* Line structure */
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Any other single character */
	"OP",

	/* Pseudo entries */
	"<ERRORTOKEN>", "<N_TOKENS>"
};
58
59
60/* Create and initialize a new tok_state structure */
61
62static struct tok_state *
63tok_new()
64{
65 struct tok_state *tok = NEW(struct tok_state, 1);
66 if (tok == NULL)
67 return NULL;
68 tok->buf = tok->cur = tok->end = tok->inp = NULL;
69 tok->done = E_OK;
70 tok->fp = NULL;
71 tok->tabsize = TABSIZE;
72 tok->indent = 0;
73 tok->indstack[0] = 0;
74 tok->atbol = 1;
75 tok->pendin = 0;
76 tok->prompt = tok->nextprompt = NULL;
77 tok->lineno = 0;
78 return tok;
79}
80
81
82/* Set up tokenizer for string */
83
84struct tok_state *
85tok_setups(str)
86 char *str;
87{
88 struct tok_state *tok = tok_new();
89 if (tok == NULL)
90 return NULL;
91 tok->buf = tok->cur = str;
92 tok->end = tok->inp = strchr(str, '\0');
93 return tok;
94}
95
96
97/* Set up tokenizer for string */
98
99struct tok_state *
100tok_setupf(fp, ps1, ps2)
101 FILE *fp;
102 char *ps1, *ps2;
103{
104 struct tok_state *tok = tok_new();
105 if (tok == NULL)
106 return NULL;
107 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
108 DEL(tok);
109 return NULL;
110 }
111 tok->cur = tok->inp = tok->buf;
112 tok->end = tok->buf + BUFSIZ;
113 tok->fp = fp;
114 tok->prompt = ps1;
115 tok->nextprompt = ps2;
116 return tok;
117}
118
119
120/* Free a tok_state structure */
121
122void
123tok_free(tok)
124 struct tok_state *tok;
125{
126 /* XXX really need a separate flag to say 'my buffer' */
127 if (tok->fp != NULL && tok->buf != NULL)
128 DEL(tok->buf);
129 DEL(tok);
130}
131
132
133/* Get next char, updating state; error code goes into tok->done */
134
135static int
136tok_nextc(tok)
137 register struct tok_state *tok;
138{
139 if (tok->done != E_OK)
140 return EOF;
141
142 for (;;) {
143 if (tok->cur < tok->inp)
144 return *tok->cur++;
145 if (tok->fp == NULL) {
146 tok->done = E_EOF;
147 return EOF;
148 }
149 if (tok->inp > tok->buf && tok->inp[-1] == '\n')
150 tok->inp = tok->buf;
151 if (tok->inp == tok->end) {
152 int n = tok->end - tok->buf;
153 char *new = tok->buf;
154 RESIZE(new, char, n+n);
155 if (new == NULL) {
156 fprintf(stderr, "tokenizer out of mem\n");
157 tok->done = E_NOMEM;
158 return EOF;
159 }
160 tok->buf = new;
161 tok->inp = tok->buf + n;
162 tok->end = tok->inp + n;
163 }
164#ifdef USE_READLINE
165 if (tok->prompt != NULL) {
166 extern char *readline PROTO((char *prompt));
167 static int been_here;
168 if (!been_here) {
169 /* Force rebind of TAB to insert-tab */
170 extern int rl_insert();
171 rl_bind_key('\t', rl_insert);
172 been_here++;
173 }
174 if (tok->buf != NULL)
175 free(tok->buf);
176 tok->buf = readline(tok->prompt);
177 (void) intrcheck(); /* Clear pending interrupt */
178 if (tok->nextprompt != NULL)
179 tok->prompt = tok->nextprompt;
180 /* XXX different semantics w/o readline()! */
181 if (tok->buf == NULL) {
182 tok->done = E_EOF;
183 }
184 else {
185 unsigned int n = strlen(tok->buf);
186 if (n > 0)
187 add_history(tok->buf);
188 /* Append the '\n' that readline()
189 doesn't give us, for the tokenizer... */
190 tok->buf = realloc(tok->buf, n+2);
191 if (tok->buf == NULL)
192 tok->done = E_NOMEM;
193 else {
194 tok->end = tok->buf + n;
195 *tok->end++ = '\n';
196 *tok->end = '\0';
197 tok->inp = tok->end;
198 tok->cur = tok->buf;
199 }
200 }
201 }
202 else
203#endif
204 {
205 tok->cur = tok->inp;
206 if (tok->prompt != NULL && tok->inp == tok->buf) {
207 fprintf(stderr, "%s", tok->prompt);
208 tok->prompt = tok->nextprompt;
209 }
210 tok->done = fgets_intr(tok->inp,
211 (int)(tok->end - tok->inp), tok->fp);
212 }
213 if (tok->done != E_OK) {
214 if (tok->prompt != NULL)
215 fprintf(stderr, "\n");
216 return EOF;
217 }
218 tok->inp = strchr(tok->inp, '\0');
219 }
220}
221
222
223/* Back-up one character */
224
225static void
226tok_backup(tok, c)
227 register struct tok_state *tok;
228 register int c;
229{
230 if (c != EOF) {
231 if (--tok->cur < tok->buf) {
232 fprintf(stderr, "tok_backup: begin of buffer\n");
233 abort();
234 }
235 if (*tok->cur != c)
236 *tok->cur = c;
237 }
238}
239
240
241/* Return the token corresponding to a single character */
242
243int
244tok_1char(c)
245 int c;
246{
247 switch (c) {
248 case '(': return LPAR;
249 case ')': return RPAR;
250 case '[': return LSQB;
251 case ']': return RSQB;
252 case ':': return COLON;
253 case ',': return COMMA;
254 case ';': return SEMI;
255 case '+': return PLUS;
256 case '-': return MINUS;
257 case '*': return STAR;
258 case '/': return SLASH;
259 case '|': return VBAR;
260 case '&': return AMPER;
261 case '<': return LESS;
262 case '>': return GREATER;
263 case '=': return EQUAL;
264 case '.': return DOT;
265 case '%': return PERCENT;
266 case '`': return BACKQUOTE;
267 case '{': return LBRACE;
268 case '}': return RBRACE;
269 default: return OP;
270 }
271}
272
273
274/* Get next token, after space stripping etc. */
275
276int
277tok_get(tok, p_start, p_end)
278 register struct tok_state *tok; /* In/out: tokenizer state */
279 char **p_start, **p_end; /* Out: point to start/end of token */
280{
281 register int c;
282
283 /* Get indentation level */
284 if (tok->atbol) {
285 register int col = 0;
286 tok->atbol = 0;
287 tok->lineno++;
288 for (;;) {
289 c = tok_nextc(tok);
290 if (c == ' ')
291 col++;
292 else if (c == '\t')
293 col = (col/tok->tabsize + 1) * tok->tabsize;
294 else
295 break;
296 }
297 tok_backup(tok, c);
298 if (col == tok->indstack[tok->indent]) {
299 /* No change */
300 }
301 else if (col > tok->indstack[tok->indent]) {
302 /* Indent -- always one */
303 if (tok->indent+1 >= MAXINDENT) {
304 fprintf(stderr, "excessive indent\n");
305 tok->done = E_TOKEN;
306 return ERRORTOKEN;
307 }
308 tok->pendin++;
309 tok->indstack[++tok->indent] = col;
310 }
311 else /* col < tok->indstack[tok->indent] */ {
312 /* Dedent -- any number, must be consistent */
313 while (tok->indent > 0 &&
314 col < tok->indstack[tok->indent]) {
315 tok->indent--;
316 tok->pendin--;
317 }
318 if (col != tok->indstack[tok->indent]) {
319 fprintf(stderr, "inconsistent dedent\n");
320 tok->done = E_TOKEN;
321 return ERRORTOKEN;
322 }
323 }
324 }
325
326 *p_start = *p_end = tok->cur;
327
328 /* Return pending indents/dedents */
329 if (tok->pendin != 0) {
330 if (tok->pendin < 0) {
331 tok->pendin++;
332 return DEDENT;
333 }
334 else {
335 tok->pendin--;
336 return INDENT;
337 }
338 }
339
340 again:
341 /* Skip spaces */
342 do {
343 c = tok_nextc(tok);
344 } while (c == ' ' || c == '\t');
345
346 /* Set start of current token */
347 *p_start = tok->cur - 1;
348
349 /* Skip comment */
350 if (c == '#') {
351 /* Hack to allow overriding the tabsize in the file.
352 This is also recognized by vi, when it occurs near the
353 beginning or end of the file. (Will vi never die...?) */
354 int x;
355 if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 &&
356 x >= 1 && x <= 40) {
357 fprintf(stderr, "# vi:set tabsize=%d:\n", x);
358 tok->tabsize = x;
359 }
360 do {
361 c = tok_nextc(tok);
362 } while (c != EOF && c != '\n');
363 }
364
365 /* Check for EOF and errors now */
366 if (c == EOF)
367 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
368
369 /* Identifier (most frequent token!) */
370 if (isalpha(c) || c == '_') {
371 do {
372 c = tok_nextc(tok);
373 } while (isalnum(c) || c == '_');
374 tok_backup(tok, c);
375 *p_end = tok->cur;
376 return NAME;
377 }
378
379 /* Newline */
380 if (c == '\n') {
381 tok->atbol = 1;
382 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
383 return NEWLINE;
384 }
385
386 /* Number */
387 if (isdigit(c)) {
388 if (c == '0') {
389 /* Hex or octal */
390 c = tok_nextc(tok);
391 if (c == '.')
392 goto fraction;
393 if (c == 'x' || c == 'X') {
394 /* Hex */
395 do {
396 c = tok_nextc(tok);
397 } while (isxdigit(c));
398 }
399 else {
400 /* Octal; c is first char of it */
401 /* There's no 'isoctdigit' macro, sigh */
402 while ('0' <= c && c < '8') {
403 c = tok_nextc(tok);
404 }
405 }
406 }
407 else {
408 /* Decimal */
409 do {
410 c = tok_nextc(tok);
411 } while (isdigit(c));
412 /* Accept floating point numbers.
413 XXX This accepts incomplete things like 12e or 1e+;
414 worry about that at run-time.
415 XXX Doesn't accept numbers starting with a dot */
416 if (c == '.') {
417 fraction:
418 /* Fraction */
419 do {
420 c = tok_nextc(tok);
421 } while (isdigit(c));
422 }
423 if (c == 'e' || c == 'E') {
424 /* Exponent part */
425 c = tok_nextc(tok);
426 if (c == '+' || c == '-')
427 c = tok_nextc(tok);
428 while (isdigit(c)) {
429 c = tok_nextc(tok);
430 }
431 }
432 }
433 tok_backup(tok, c);
434 *p_end = tok->cur;
435 return NUMBER;
436 }
437
438 /* String */
439 if (c == '\'') {
440 for (;;) {
441 c = tok_nextc(tok);
442 if (c == '\n' || c == EOF) {
443 tok->done = E_TOKEN;
444 return ERRORTOKEN;
445 }
446 if (c == '\\') {
447 c = tok_nextc(tok);
448 *p_end = tok->cur;
449 if (c == '\n' || c == EOF) {
450 tok->done = E_TOKEN;
451 return ERRORTOKEN;
452 }
453 continue;
454 }
455 if (c == '\'')
456 break;
457 }
458 *p_end = tok->cur;
459 return STRING;
460 }
461
462 /* Line continuation */
463 if (c == '\\') {
464 c = tok_nextc(tok);
465 if (c != '\n') {
466 tok->done = E_TOKEN;
467 return ERRORTOKEN;
468 }
Guido van Rossuma7691721990-11-09 15:08:39 +0000469 tok->lineno++;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000470 goto again; /* Read next line */
471 }
472
473 /* Punctuation character */
474 *p_end = tok->cur;
475 return tok_1char(c);
476}
477
478
479#ifdef DEBUG
480
481void
482tok_dump(type, start, end)
483 int type;
484 char *start, *end;
485{
486 printf("%s", tok_name[type]);
487 if (type == NAME || type == NUMBER || type == STRING || type == OP)
488 printf("(%.*s)", (int)(end - start), start);
489}
490
491#endif