blob: 7e7a37025ca763f041369de912e6541f9814b868 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Tim Petersdbd9ba62000-07-09 03:09:57 +000021extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Printable names of the token types.
   This table must match the #defines in token.h: entry N is the name
   of the token whose number is N. */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* Catch-all and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000131#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->decoding_readline = NULL;
133 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135 return tok;
136}
137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138#ifdef PGEN
139
140static char *
141decoding_fgets(char *s, int size, struct tok_state *tok)
142{
143 return fgets(s, size, tok->fp);
144}
145
146static int
147decoding_feof(struct tok_state *tok)
148{
149 return feof(tok->fp);
150}
151
/* PGEN build: strings are used as they are; TOK is not touched and
   STR is returned unchanged. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
157
158#else /* PGEN */
159
160static char *
161error_ret(struct tok_state *tok) /* XXX */
162{
163 tok->decoding_erred = 1;
164 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
165 PyMem_DEL(tok->buf);
166 tok->buf = NULL;
167 return NULL; /* as if it were EOF */
168}
169
170static char *
171new_string(const char *s, int len)
172{
173 char* result = PyMem_NEW(char, len + 1);
174 if (result != NULL) {
175 memcpy(result, s, len);
176 result[len] = '\0';
177 }
178 return result;
179}
180
/* Map the common spellings of UTF-8 and Latin-1 onto a canonical name.

   Only the first 12 characters of S are examined; they are lowercased
   and '_' is treated as '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S names (or starts with, followed by '-') one of
   the known aliases; otherwise returns S itself, so callers can detect
   "no change" by comparing pointers.  The returned literals must not
   be modified or freed. */
static char *
get_normal_name(char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value other
           than EOF to tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
203
/* Return the coding spec in S, or NULL if none is found.

   The first SIZE bytes of S are searched for the word "coding"
   directly followed by ':' or '=', optional spaces/tabs, and a name
   made of alphanumerics, '-', '_' and '.'.  S must be NUL-terminated,
   because the name scan stops at the first non-name character rather
   than at SIZE.

   The result is a newly allocated string holding the normalized
   encoding name (see get_normal_name); the caller owns it and must
   release it with PyMem_DEL.  NULL is also returned when memory
   runs out. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* Cast through unsigned char: isalnum() on a negative char
           value is undefined behaviour. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin < t) {
            char *r = new_string(begin, (int)(t - begin));
            char *q;

            /* Out of memory: do not hand NULL to get_normal_name,
               which would dereference it. */
            if (r == NULL)
                return NULL;
            q = get_normal_name(r);
            if (r != q) {
                /* q is a string literal; give the caller an owned copy. */
                PyMem_DEL(r);
                r = new_string(q, (int)strlen(q));
            }
            return r;
        }
    }
    return NULL;
}
239
240/* Check whether the line contains a coding spec. If it does,
241 invoke the set_readline function for the new encoding.
242 This function receives the tok_state and the new encoding.
243 Return 1 on success, 0 on failure. */
244
245static int
246check_coding_spec(const char* line, int size, struct tok_state *tok,
247 int set_readline(struct tok_state *, const char *))
248{
249 int r = 1;
250 char* cs = get_coding_spec(line, size);
251 if (cs != NULL) {
252 tok->read_coding_spec = 1;
253 if (tok->encoding == NULL) {
254 assert(tok->decoding_state == 1); /* raw */
255 if (strcmp(cs, "utf-8") == 0 ||
256 strcmp(cs, "iso-8859-1") == 0) {
257 tok->encoding = cs;
258 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000259#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 r = set_readline(tok, cs);
261 if (r) {
262 tok->encoding = cs;
263 tok->decoding_state = -1;
264 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000265#else
266 /* Without Unicode support, we cannot
267 process the coding spec. Since there
268 won't be any Unicode literals, that
269 won't matter. */
270#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 }
272 } else { /* then, compare cs with BOM */
273 r = (strcmp(tok->encoding, cs) == 0);
274 PyMem_DEL(cs);
275 }
276 }
277 return r;
278}
279
280/* See whether the file starts with a BOM. If it does,
281 invoke the set_readline function with the new encoding.
282 Return 1 on success, 0 on failure. */
283
284static int
285check_bom(int get_char(struct tok_state *),
286 void unget_char(int, struct tok_state *),
287 int set_readline(struct tok_state *, const char *),
288 struct tok_state *tok)
289{
290 int ch = get_char(tok);
291 tok->decoding_state = 1;
292 if (ch == EOF) {
293 return 1;
294 } else if (ch == 0xEF) {
295 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
296 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
297#if 0
298 /* Disable support for UTF-16 BOMs until a decision
299 is made whether this needs to be supported. */
300 } else if (ch == 0xFE) {
301 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
302 if (!set_readline(tok, "utf-16-be")) return 0;
303 tok->decoding_state = -1;
304 } else if (ch == 0xFF) {
305 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
306 if (!set_readline(tok, "utf-16-le")) return 0;
307 tok->decoding_state = -1;
308#endif
309 } else {
310 unget_char(ch, tok);
311 return 1;
312 }
313 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
314 return 1;
315 NON_BOM:
316 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
317 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
318 return 1;
319}
320
/* Read a line of text from TOK into S, using the decoding readline
   callable stored in TOK, and store it as UTF-8.

   At most SIZE-1 bytes are copied and S is always NUL-terminated.
   If the decoded line does not fit, the remaining UTF-8 bytes are
   kept as a string object in tok->decoding_buffer and are returned by
   the next call, so a long line never overruns S.

   Return S on success; NULL at EOF (empty line read) or on failure.
   On failure error_ret() has been called, which sets
   tok->decoding_erred. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL;
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    const char *str;
    int utf8len;

    /* Ask for one less byte so we can terminate it. */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Take over the reference held by the tokenizer state. */
        tok->decoding_buffer = NULL;
        /* A string object here is the UTF-8 tail left over from a
           previous call; it needs no further conversion. */
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }

    str = PyString_AsString(utf8);
    if (str == NULL) {
        Py_DECREF(utf8);
        return error_ret(tok);
    }
    utf8len = (int)PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Too long for S: save the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(str + size, utf8len - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
355
356/* Set the readline function for TOK to a StreamReader's
357 readline function. The StreamReader is named ENC.
358
359 This function is called from check_bom and check_coding_spec.
360
361 ENC is usually identical to the future value of tok->encoding,
362 except for the (currently unsupported) case of UTF-16.
363
364 Return 1 on success, 0 on failure. */
365
366static int
367fp_setreadl(struct tok_state *tok, const char* enc)
368{
369 PyObject *reader, *stream, *readline;
370
371 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000372 if (stream == NULL)
373 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000374
375 reader = PyCodec_StreamReader(enc, stream, NULL);
376 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000377 if (reader == NULL)
378 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000379
380 readline = PyObject_GetAttrString(reader, "readline");
381 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000382 if (readline == NULL)
383 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384
385 tok->decoding_readline = readline;
386 return 1;
387}
388
389/* Fetch the next byte from TOK. */
390
391static int fp_getc(struct tok_state *tok) {
392 return getc(tok->fp);
393}
394
395/* Unfetch the last byte back into TOK. */
396
397static void fp_ungetc(int c, struct tok_state *tok) {
398 ungetc(c, tok->fp);
399}
400
401/* Read a line of input from TOK. Determine encoding
402 if necessary. */
403
404static char *
405decoding_fgets(char *s, int size, struct tok_state *tok)
406{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000407 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000408 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000409 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410 if (tok->decoding_state < 0) {
411 /* We already have a codec associated with
412 this input. */
413 line = fp_readl(s, size, tok);
414 break;
415 } else if (tok->decoding_state > 0) {
416 /* We want a 'raw' read. */
417 line = Py_UniversalNewlineFgets(s, size,
418 tok->fp, NULL);
419 warn = 1;
420 break;
421 } else {
422 /* We have not yet determined the encoding.
423 If an encoding is found, use the file-pointer
424 reader functions from now on. */
425 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
426 return error_ret(tok);
427 assert(tok->decoding_state != 0);
428 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
431 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
432 return error_ret(tok);
433 }
434 }
435#ifndef PGEN
436 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
437 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000438 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000439 if (*c > 127) {
440 badchar = *c;
441 break;
442 }
443 }
444 if (badchar) {
445 char buf[200];
446 sprintf(buf, "Non-ASCII character '\\x%.2x', "
447 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000448 /* Need to add 1 to the line number, since this line
449 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000450 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000451 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000452 NULL, NULL);
453 tok->issued_encoding_warning = 1;
454 }
455#endif
456 return line;
457}
458
459static int
460decoding_feof(struct tok_state *tok)
461{
462 if (tok->decoding_state >= 0) {
463 return feof(tok->fp);
464 } else {
465 PyObject* buf = tok->decoding_buffer;
466 if (buf == NULL) {
467 buf = PyObject_CallObject(tok->decoding_readline, NULL);
468 if (buf == NULL) {
469 error_ret(tok);
470 return 1;
471 } else {
472 tok->decoding_buffer = buf;
473 }
474 }
475 return PyObject_Length(buf) == 0;
476 }
477}
478
479/* Fetch a byte from TOK, using the string buffer. */
480
481static int buf_getc(struct tok_state *tok) {
482 return *tok->str++;
483}
484
485/* Unfetch a byte from TOK, using the string buffer. */
486
487static void buf_ungetc(int c, struct tok_state *tok) {
488 tok->str--;
489 assert(*tok->str == c); /* tok->cur may point to read-only segment */
490}
491
492/* Set the readline function for TOK to ENC. For the string-based
493 tokenizer, this means to just record the encoding. */
494
495static int buf_setreadl(struct tok_state *tok, const char* enc) {
496 tok->enc = enc;
497 return 1;
498}
499
/* Decode the NUL-terminated C string STR from encoding ENC and
   re-encode it as UTF-8.  Returns a new Python string object, or
   NULL if either step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515
516/* Decode a byte string STR for use as the buffer of TOK.
517 Look for encoding declarations inside STR, and record them
518 inside TOK. */
519
520static const char *
521decode_str(const char *str, struct tok_state *tok)
522{
523 PyObject* utf8 = NULL;
524 const char *s;
525 int lineno = 0;
526 tok->enc = NULL;
527 tok->str = str;
528 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
529 return NULL;
530 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000531 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000532#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533 if (tok->enc != NULL) {
534 utf8 = translate_into_utf8(str, tok->enc);
535 if (utf8 == NULL)
536 return NULL;
537 str = PyString_AsString(utf8);
538 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000539#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 for (s = str;; s++) {
541 if (*s == '\0') break;
542 else if (*s == '\n') {
543 lineno++;
544 if (lineno == 2) break;
545 }
546 }
547 tok->enc = NULL;
548 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
549 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000550#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 if (tok->enc != NULL) {
552 assert(utf8 == NULL);
553 utf8 = translate_into_utf8(str, tok->enc);
554 if (utf8 == NULL)
555 return NULL;
556 str = PyString_AsString(utf8);
557 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000558#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 assert(tok->decoding_buffer == NULL);
560 tok->decoding_buffer = utf8; /* CAUTION */
561 return str;
562}
563
564#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000565
566/* Set up tokenizer for string */
567
568struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000569PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000570{
571 struct tok_state *tok = tok_new();
572 if (tok == NULL)
573 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574 str = (char *)decode_str(str, tok);
575 if (str == NULL)
576 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000577 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000578 return tok;
579}
580
581
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000582/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000583
584struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000585PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000586{
587 struct tok_state *tok = tok_new();
588 if (tok == NULL)
589 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000590 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
591 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000592 return NULL;
593 }
594 tok->cur = tok->inp = tok->buf;
595 tok->end = tok->buf + BUFSIZ;
596 tok->fp = fp;
597 tok->prompt = ps1;
598 tok->nextprompt = ps2;
599 return tok;
600}
601
602
603/* Free a tok_state structure */
604
605void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000606PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000607{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 if (tok->encoding != NULL)
609 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000610#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611 Py_XDECREF(tok->decoding_readline);
612 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000613#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000614 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000615 PyMem_DEL(tok->buf);
616 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617}
618
619
620/* Get next char, updating state; error code goes into tok->done */
621
622static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000623tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000624{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000626 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000627 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000628 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000629 if (tok->done != E_OK)
630 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000632 char *end = strchr(tok->inp, '\n');
633 if (end != NULL)
634 end++;
635 else {
636 end = strchr(tok->inp, '\0');
637 if (end == tok->inp) {
638 tok->done = E_EOF;
639 return EOF;
640 }
641 }
642 if (tok->start == NULL)
643 tok->buf = tok->cur;
644 tok->lineno++;
645 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000646 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000649 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650 if (tok->nextprompt != NULL)
651 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000652 if (new == NULL)
653 tok->done = E_INTR;
654 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000655 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 tok->done = E_EOF;
657 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000658 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000659 size_t start = tok->start - tok->buf;
660 size_t oldlen = tok->cur - tok->buf;
661 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000662 char *buf = tok->buf;
663 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000664 tok->lineno++;
665 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000666 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000667 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000668 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000669 tok->done = E_NOMEM;
670 return EOF;
671 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000672 tok->buf = buf;
673 tok->cur = tok->buf + oldlen;
674 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000675 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000676 tok->inp = tok->buf + newlen;
677 tok->end = tok->inp + 1;
678 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000679 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000680 else {
681 tok->lineno++;
682 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000683 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000684 tok->buf = new;
685 tok->cur = tok->buf;
686 tok->inp = strchr(tok->buf, '\0');
687 tok->end = tok->inp + 1;
688 }
689 }
690 else {
691 int done = 0;
692 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000693 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000694 if (tok->start == NULL) {
695 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000696 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000697 if (tok->buf == NULL) {
698 tok->done = E_NOMEM;
699 return EOF;
700 }
701 tok->end = tok->buf + BUFSIZ;
702 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
704 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000705 tok->done = E_EOF;
706 done = 1;
707 }
708 else {
709 tok->done = E_OK;
710 tok->inp = strchr(tok->buf, '\0');
711 done = tok->inp[-1] == '\n';
712 }
713 }
714 else {
715 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000716 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000717 tok->done = E_EOF;
718 done = 1;
719 }
720 else
721 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000722 }
723 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000724 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000725 while (!done) {
726 int curstart = tok->start == NULL ? -1 :
727 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000728 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000729 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000730 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000731 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000732 if (newbuf == NULL) {
733 tok->done = E_NOMEM;
734 tok->cur = tok->inp;
735 return EOF;
736 }
737 tok->buf = newbuf;
738 tok->inp = tok->buf + curvalid;
739 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000740 tok->start = curstart < 0 ? NULL :
741 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000742 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000743 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000744 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000745 /* Last line does not end in \n,
746 fake one */
747 strcpy(tok->inp, "\n");
748 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000749 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000750 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000751 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000752 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000753#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000754 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000755 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000756 pt = tok->inp - 2;
757 if (pt >= tok->buf && *pt == '\r') {
758 *pt++ = '\n';
759 *pt = '\0';
760 tok->inp = pt;
761 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000762#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763 }
764 if (tok->done != E_OK) {
765 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000766 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000767 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768 return EOF;
769 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000771 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772}
773
774
775/* Back-up one character */
776
777static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000778tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779{
780 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000781 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000782 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 if (*tok->cur != c)
784 *tok->cur = c;
785 }
786}
787
788
789/* Return the token corresponding to a single character */
790
791int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000792PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793{
794 switch (c) {
795 case '(': return LPAR;
796 case ')': return RPAR;
797 case '[': return LSQB;
798 case ']': return RSQB;
799 case ':': return COLON;
800 case ',': return COMMA;
801 case ';': return SEMI;
802 case '+': return PLUS;
803 case '-': return MINUS;
804 case '*': return STAR;
805 case '/': return SLASH;
806 case '|': return VBAR;
807 case '&': return AMPER;
808 case '<': return LESS;
809 case '>': return GREATER;
810 case '=': return EQUAL;
811 case '.': return DOT;
812 case '%': return PERCENT;
813 case '`': return BACKQUOTE;
814 case '{': return LBRACE;
815 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000816 case '^': return CIRCUMFLEX;
817 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818 default: return OP;
819 }
820}
821
822
Guido van Rossumfbab9051991-10-20 20:25:03 +0000823int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000824PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000825{
826 switch (c1) {
827 case '=':
828 switch (c2) {
829 case '=': return EQEQUAL;
830 }
831 break;
832 case '!':
833 switch (c2) {
834 case '=': return NOTEQUAL;
835 }
836 break;
837 case '<':
838 switch (c2) {
839 case '>': return NOTEQUAL;
840 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000841 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000842 }
843 break;
844 case '>':
845 switch (c2) {
846 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000847 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000848 }
849 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000850 case '+':
851 switch (c2) {
852 case '=': return PLUSEQUAL;
853 }
854 break;
855 case '-':
856 switch (c2) {
857 case '=': return MINEQUAL;
858 }
859 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000860 case '*':
861 switch (c2) {
862 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000863 case '=': return STAREQUAL;
864 }
865 break;
866 case '/':
867 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000868 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000869 case '=': return SLASHEQUAL;
870 }
871 break;
872 case '|':
873 switch (c2) {
874 case '=': return VBAREQUAL;
875 }
876 break;
877 case '%':
878 switch (c2) {
879 case '=': return PERCENTEQUAL;
880 }
881 break;
882 case '&':
883 switch (c2) {
884 case '=': return AMPEREQUAL;
885 }
886 break;
887 case '^':
888 switch (c2) {
889 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000890 }
891 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000892 }
893 return OP;
894}
895
Thomas Wouters434d0822000-08-24 20:11:32 +0000896int
897PyToken_ThreeChars(int c1, int c2, int c3)
898{
899 switch (c1) {
900 case '<':
901 switch (c2) {
902 case '<':
903 switch (c3) {
904 case '=':
905 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000906 }
907 break;
908 }
909 break;
910 case '>':
911 switch (c2) {
912 case '>':
913 switch (c3) {
914 case '=':
915 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000916 }
917 break;
918 }
919 break;
920 case '*':
921 switch (c2) {
922 case '*':
923 switch (c3) {
924 case '=':
925 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000926 }
927 break;
928 }
929 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000930 case '/':
931 switch (c2) {
932 case '/':
933 switch (c3) {
934 case '=':
935 return DOUBLESLASHEQUAL;
936 }
937 break;
938 }
939 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000940 }
941 return OP;
942}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000943
Guido van Rossum926f13a1998-04-09 21:38:06 +0000944static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000945indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000946{
947 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000948 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000949 tok->cur = tok->inp;
950 return 1;
951 }
952 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000953 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
954 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000955 tok->altwarning = 0;
956 }
957 return 0;
958}
959
960
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000961/* Get next token, after space stripping etc. */
962
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000963static int
964tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000965{
966 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000967 int blankline;
968
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000969 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000970 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000971 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000972 blankline = 0;
973
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000974 /* Get indentation level */
975 if (tok->atbol) {
976 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000977 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000978 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000979 for (;;) {
980 c = tok_nextc(tok);
981 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000982 col++, altcol++;
983 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000984 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000985 altcol = (altcol/tok->alttabsize + 1)
986 * tok->alttabsize;
987 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000988 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000989 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000990 else
991 break;
992 }
993 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000994 if (c == '#' || c == '\n') {
995 /* Lines with only whitespace and/or comments
996 shouldn't affect the indentation and are
997 not passed to the parser as NEWLINE tokens,
998 except *totally* empty lines in interactive
999 mode, which signal the end of a command group. */
1000 if (col == 0 && c == '\n' && tok->prompt != NULL)
1001 blankline = 0; /* Let it through */
1002 else
1003 blankline = 1; /* Ignore completely */
1004 /* We can't jump back right here since we still
1005 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001007 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001008 if (col == tok->indstack[tok->indent]) {
1009 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001010 if (altcol != tok->altindstack[tok->indent]) {
1011 if (indenterror(tok))
1012 return ERRORTOKEN;
1013 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001014 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001015 else if (col > tok->indstack[tok->indent]) {
1016 /* Indent -- always one */
1017 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001018 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001019 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001020 return ERRORTOKEN;
1021 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001022 if (altcol <= tok->altindstack[tok->indent]) {
1023 if (indenterror(tok))
1024 return ERRORTOKEN;
1025 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001026 tok->pendin++;
1027 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001028 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001029 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001030 else /* col < tok->indstack[tok->indent] */ {
1031 /* Dedent -- any number, must be consistent */
1032 while (tok->indent > 0 &&
1033 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001034 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001035 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001036 }
1037 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001038 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001039 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001040 return ERRORTOKEN;
1041 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001042 if (altcol != tok->altindstack[tok->indent]) {
1043 if (indenterror(tok))
1044 return ERRORTOKEN;
1045 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001046 }
1047 }
1048 }
1049
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001050 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001051
1052 /* Return pending indents/dedents */
1053 if (tok->pendin != 0) {
1054 if (tok->pendin < 0) {
1055 tok->pendin++;
1056 return DEDENT;
1057 }
1058 else {
1059 tok->pendin--;
1060 return INDENT;
1061 }
1062 }
1063
1064 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001065 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001066 /* Skip spaces */
1067 do {
1068 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001069 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001070
1071 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001072 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073
Guido van Rossumab5ca152000-03-31 00:52:27 +00001074 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001075 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001076 static char *tabforms[] = {
1077 "tab-width:", /* Emacs */
1078 ":tabstop=", /* vim, full form */
1079 ":ts=", /* vim, abbreviated form */
1080 "set tabsize=", /* will vi never die? */
1081 /* more templates can be added here to support other editors */
1082 };
1083 char cbuf[80];
1084 char *tp, **cp;
1085 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001087 *tp++ = c = tok_nextc(tok);
1088 } while (c != EOF && c != '\n' &&
1089 tp - cbuf + 1 < sizeof(cbuf));
1090 *tp = '\0';
1091 for (cp = tabforms;
1092 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1093 cp++) {
1094 if ((tp = strstr(cbuf, *cp))) {
1095 int newsize = atoi(tp + strlen(*cp));
1096
1097 if (newsize >= 1 && newsize <= 40) {
1098 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001099 if (Py_VerboseFlag)
1100 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001101 "Tab size set to %d\n",
1102 newsize);
1103 }
1104 }
1105 }
1106 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001107 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108 }
1109
1110 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001111 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001113 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114
1115 /* Identifier (most frequent token!) */
1116 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001117 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001118 switch (c) {
1119 case 'r':
1120 case 'R':
1121 c = tok_nextc(tok);
1122 if (c == '"' || c == '\'')
1123 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001124 break;
1125 case 'u':
1126 case 'U':
1127 c = tok_nextc(tok);
1128 if (c == 'r' || c == 'R')
1129 c = tok_nextc(tok);
1130 if (c == '"' || c == '\'')
1131 goto letter_quote;
1132 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001133 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001134 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001136 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001138 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001139 *p_end = tok->cur;
1140 return NAME;
1141 }
1142
1143 /* Newline */
1144 if (c == '\n') {
1145 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001146 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001147 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001148 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1150 return NEWLINE;
1151 }
1152
Guido van Rossum2d45be11997-04-11 19:16:25 +00001153#ifdef macintosh
1154 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001155 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001156 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001157 tok->done = E_TOKEN;
1158 tok->cur = tok->inp;
1159 return ERRORTOKEN;
1160 }
1161#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001162 /* Period or number starting with period? */
1163 if (c == '.') {
1164 c = tok_nextc(tok);
1165 if (isdigit(c)) {
1166 goto fraction;
1167 }
1168 else {
1169 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001170 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001171 *p_end = tok->cur;
1172 return DOT;
1173 }
1174 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001175
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 /* Number */
1177 if (isdigit(c)) {
1178 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001179 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 c = tok_nextc(tok);
1181 if (c == '.')
1182 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001183#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001184 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001185 goto imaginary;
1186#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187 if (c == 'x' || c == 'X') {
1188 /* Hex */
1189 do {
1190 c = tok_nextc(tok);
1191 } while (isxdigit(c));
1192 }
1193 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001194 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001195 /* Octal; c is first char of it */
1196 /* There's no 'isoctdigit' macro, sigh */
1197 while ('0' <= c && c < '8') {
1198 c = tok_nextc(tok);
1199 }
Tim Petersd507dab2001-08-30 20:51:59 +00001200 if (isdigit(c)) {
1201 found_decimal = 1;
1202 do {
1203 c = tok_nextc(tok);
1204 } while (isdigit(c));
1205 }
1206 if (c == '.')
1207 goto fraction;
1208 else if (c == 'e' || c == 'E')
1209 goto exponent;
1210#ifndef WITHOUT_COMPLEX
1211 else if (c == 'j' || c == 'J')
1212 goto imaginary;
1213#endif
1214 else if (found_decimal) {
1215 tok->done = E_TOKEN;
1216 tok_backup(tok, c);
1217 return ERRORTOKEN;
1218 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001219 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001220 if (c == 'l' || c == 'L')
1221 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 }
1223 else {
1224 /* Decimal */
1225 do {
1226 c = tok_nextc(tok);
1227 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001228 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001230 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001231 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001232 if (c == '.') {
1233 fraction:
1234 /* Fraction */
1235 do {
1236 c = tok_nextc(tok);
1237 } while (isdigit(c));
1238 }
1239 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001240 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001241 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001242 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001243 if (c == '+' || c == '-')
1244 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001245 if (!isdigit(c)) {
1246 tok->done = E_TOKEN;
1247 tok_backup(tok, c);
1248 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001249 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001250 do {
1251 c = tok_nextc(tok);
1252 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001254#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001255 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001256 /* Imaginary part */
1257 imaginary:
1258 c = tok_nextc(tok);
1259#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001260 }
1261 }
1262 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001263 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 *p_end = tok->cur;
1265 return NUMBER;
1266 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001267
1268 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001269 /* String */
1270 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +00001271 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001272 int quote = c;
1273 int triple = 0;
1274 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275 for (;;) {
1276 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001277 if (c == '\n') {
1278 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001279 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001280 tok_backup(tok, c);
1281 return ERRORTOKEN;
1282 }
1283 tripcount = 0;
1284 }
1285 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001286 if (triple)
1287 tok->done = E_EOFS;
1288 else
1289 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001290 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 return ERRORTOKEN;
1292 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001293 else if (c == quote) {
1294 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001295 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001296 c = tok_nextc(tok);
1297 if (c == quote) {
1298 triple = 1;
1299 tripcount = 0;
1300 continue;
1301 }
1302 tok_backup(tok, c);
1303 }
1304 if (!triple || tripcount == 3)
1305 break;
1306 }
1307 else if (c == '\\') {
1308 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001310 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001311 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001312 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 return ERRORTOKEN;
1314 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001315 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001316 else
1317 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001319 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001320 *p_end = tok->cur;
1321 return STRING;
1322 }
1323
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 /* Line continuation */
1325 if (c == '\\') {
1326 c = tok_nextc(tok);
1327 if (c != '\n') {
1328 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001329 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001330 return ERRORTOKEN;
1331 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001332 goto again; /* Read next line */
1333 }
1334
Guido van Rossumfbab9051991-10-20 20:25:03 +00001335 /* Check for two-character token */
1336 {
1337 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001338 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001339 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001340 int c3 = tok_nextc(tok);
1341 int token3 = PyToken_ThreeChars(c, c2, c3);
1342 if (token3 != OP) {
1343 token = token3;
1344 } else {
1345 tok_backup(tok, c3);
1346 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001347 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001348 *p_end = tok->cur;
1349 return token;
1350 }
1351 tok_backup(tok, c2);
1352 }
1353
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001354 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001355 switch (c) {
1356 case '(':
1357 case '[':
1358 case '{':
1359 tok->level++;
1360 break;
1361 case ')':
1362 case ']':
1363 case '}':
1364 tok->level--;
1365 break;
1366 }
1367
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001369 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001370 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001371 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372}
1373
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001374int
1375PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1376{
1377 int result = tok_get(tok, p_start, p_end);
1378 if (tok->decoding_erred) {
1379 result = ERRORTOKEN;
1380 tok->done = E_DECODE;
1381 }
1382 return result;
1383}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001384
Guido van Rossum408027e1996-12-30 16:17:54 +00001385#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001386
1387void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001388tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389{
Guido van Rossum86bea461997-04-29 21:03:06 +00001390 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1392 printf("(%.*s)", (int)(end - start), start);
1393}
1394
1395#endif