blob: d7a223ae0f8a6bb8219d93d258dd078536a7cbd5 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.

   Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER", "EQUAL",
    "DOT", "PERCENT", "BACKQUOTE", "LBRACE", "RBRACE",
    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Bitwise operators, shifts and power */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* Floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* Catch-all and sentinels */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* PGEN build: strings are used as-is, so STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyMem_DEL(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
172new_string(const char *s, int len)
173{
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the spellings of utf-8 and latin-1 onto a canonical name.

   At most the first 12 characters of S are examined, lower-cased, and
   have '_' replaced by '-'.  If the result names UTF-8 or Latin-1
   (optionally followed by "-<suffix>") the string literal "utf-8" or
   "iso-8859-1" is returned; otherwise S itself is returned unchanged.
   Callers tell the two cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative value (a
           non-ASCII byte where plain char is signed) to tolower() is
           undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
204
/* Return the coding spec in S, or NULL if none is found.

   S is expected to be NUL-terminated; SIZE bounds where the search for
   the word "coding" may start, while the encoding name itself is
   scanned up to the first character that cannot be part of a name.
   The result is a newly allocated string (normalized through
   get_normal_name) that the caller must release with PyMem_DEL.
   NULL is also returned when memory runs out. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        /* Skip the separator and any blanks that follow it. */
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* <ctype.h> functions need an unsigned char value; a negative
           char (non-ASCII byte) would be undefined behaviour. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin < t) {
            char *r = new_string(begin, (int)(t - begin));
            char *q;

            /* get_normal_name() dereferences its argument, so an
               allocation failure has to be caught first. */
            if (r == NULL)
                return NULL;
            q = get_normal_name(r);
            if (r != q) {
                /* A canonical literal was returned; hand out a
                   heap copy so the caller can always free it. */
                PyMem_DEL(r);
                r = new_string(q, (int)strlen(q));
            }
            return r;
        }
    }
    return NULL;
}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
255check_coding_spec(const char* line, int size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000279#else
280 /* Without Unicode support, we cannot
281 process the coding spec. Since there
282 won't be any Unicode literals, that
283 won't matter. */
284#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
286 } else { /* then, compare cs with BOM */
287 r = (strcmp(tok->encoding, cs) == 0);
288 PyMem_DEL(cs);
289 }
290 }
291 return r;
292}
293
294/* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
297
298static int
299check_bom(int get_char(struct tok_state *),
300 void unget_char(int, struct tok_state *),
301 int set_readline(struct tok_state *, const char *),
302 struct tok_state *tok)
303{
304 int ch = get_char(tok);
305 tok->decoding_state = 1;
306 if (ch == EOF) {
307 return 1;
308 } else if (ch == 0xEF) {
309 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
310 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
311#if 0
312 /* Disable support for UTF-16 BOMs until a decision
313 is made whether this needs to be supported. */
314 } else if (ch == 0xFE) {
315 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
316 if (!set_readline(tok, "utf-16-be")) return 0;
317 tok->decoding_state = -1;
318 } else if (ch == 0xFF) {
319 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
320 if (!set_readline(tok, "utf-16-le")) return 0;
321 tok->decoding_state = -1;
322#endif
323 } else {
324 unget_char(ch, tok);
325 return 1;
326 }
327 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
328 return 1;
329 NON_BOM:
330 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
331 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
332 return 1;
333}
334
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure or EOF, else S.

   The line comes from tok->decoding_readline (or from
   tok->decoding_buffer if something is pending there) and is stored
   as UTF-8.  At most SIZE-1 bytes are copied, followed by a NUL.  If
   the line is longer, the remainder is kept in tok->decoding_buffer as
   a string object and is returned by the following call(s), so S is
   never overrun. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    const char *str;
    int utf8len;

    /* Ask for one less byte so we can terminate it. */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        PyObject *args = PyTuple_New(0);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Take over the reference held by the tokenizer state. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            /* Left-over tail of an over-long line; it is
               already UTF-8 encoded. */
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }

    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Does not fit: keep the excess for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(str + size, utf8len - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (s[0] == '\0')
        return NULL; /* EOF */
    return s;
#endif
}
373
374/* Set the readline function for TOK to a StreamReader's
375 readline function. The StreamReader is named ENC.
376
377 This function is called from check_bom and check_coding_spec.
378
379 ENC is usually identical to the future value of tok->encoding,
380 except for the (currently unsupported) case of UTF-16.
381
382 Return 1 on success, 0 on failure. */
383
384static int
385fp_setreadl(struct tok_state *tok, const char* enc)
386{
387 PyObject *reader, *stream, *readline;
388
Martin v. Löwis95292d62002-12-11 14:04:59 +0000389 /* XXX: constify filename argument. */
390 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000391 if (stream == NULL)
392 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000393
394 reader = PyCodec_StreamReader(enc, stream, NULL);
395 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000396 if (reader == NULL)
397 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000398
399 readline = PyObject_GetAttrString(reader, "readline");
400 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000401 if (readline == NULL)
402 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000403
404 tok->decoding_readline = readline;
405 return 1;
406}
407
408/* Fetch the next byte from TOK. */
409
410static int fp_getc(struct tok_state *tok) {
411 return getc(tok->fp);
412}
413
414/* Unfetch the last byte back into TOK. */
415
416static void fp_ungetc(int c, struct tok_state *tok) {
417 ungetc(c, tok->fp);
418}
419
420/* Read a line of input from TOK. Determine encoding
421 if necessary. */
422
423static char *
424decoding_fgets(char *s, int size, struct tok_state *tok)
425{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000426 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000427 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429 if (tok->decoding_state < 0) {
430 /* We already have a codec associated with
431 this input. */
432 line = fp_readl(s, size, tok);
433 break;
434 } else if (tok->decoding_state > 0) {
435 /* We want a 'raw' read. */
436 line = Py_UniversalNewlineFgets(s, size,
437 tok->fp, NULL);
438 warn = 1;
439 break;
440 } else {
441 /* We have not yet determined the encoding.
442 If an encoding is found, use the file-pointer
443 reader functions from now on. */
444 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
445 return error_ret(tok);
446 assert(tok->decoding_state != 0);
447 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000448 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
450 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
451 return error_ret(tok);
452 }
453 }
454#ifndef PGEN
455 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
456 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000457 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458 if (*c > 127) {
459 badchar = *c;
460 break;
461 }
462 }
463 if (badchar) {
464 char buf[200];
465 sprintf(buf, "Non-ASCII character '\\x%.2x', "
466 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000467 /* Need to add 1 to the line number, since this line
468 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000470 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471 NULL, NULL);
472 tok->issued_encoding_warning = 1;
473 }
474#endif
475 return line;
476}
477
478static int
479decoding_feof(struct tok_state *tok)
480{
481 if (tok->decoding_state >= 0) {
482 return feof(tok->fp);
483 } else {
484 PyObject* buf = tok->decoding_buffer;
485 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000486 PyObject *args = PyTuple_New(0);
487 if (args == NULL) {
488 error_ret(tok);
489 return 1;
490 }
491 buf = PyObject_Call(tok->decoding_readline,
492 args, NULL);
493 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494 if (buf == NULL) {
495 error_ret(tok);
496 return 1;
497 } else {
498 tok->decoding_buffer = buf;
499 }
500 }
501 return PyObject_Length(buf) == 0;
502 }
503}
504
505/* Fetch a byte from TOK, using the string buffer. */
506
507static int buf_getc(struct tok_state *tok) {
508 return *tok->str++;
509}
510
511/* Unfetch a byte from TOK, using the string buffer. */
512
513static void buf_ungetc(int c, struct tok_state *tok) {
514 tok->str--;
515 assert(*tok->str == c); /* tok->cur may point to read-only segment */
516}
517
518/* Set the readline function for TOK to ENC. For the string-based
519 tokenizer, this means to just record the encoding. */
520
521static int buf_setreadl(struct tok_state *tok, const char* enc) {
522 tok->enc = enc;
523 return 1;
524}
525
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;

    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541
542/* Decode a byte string STR for use as the buffer of TOK.
543 Look for encoding declarations inside STR, and record them
544 inside TOK. */
545
546static const char *
547decode_str(const char *str, struct tok_state *tok)
548{
549 PyObject* utf8 = NULL;
550 const char *s;
551 int lineno = 0;
552 tok->enc = NULL;
553 tok->str = str;
554 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
555 return NULL;
556 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000557 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000558#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 if (tok->enc != NULL) {
560 utf8 = translate_into_utf8(str, tok->enc);
561 if (utf8 == NULL)
562 return NULL;
563 str = PyString_AsString(utf8);
564 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000565#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000566 for (s = str;; s++) {
567 if (*s == '\0') break;
568 else if (*s == '\n') {
569 lineno++;
570 if (lineno == 2) break;
571 }
572 }
573 tok->enc = NULL;
574 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
575 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000576#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577 if (tok->enc != NULL) {
578 assert(utf8 == NULL);
579 utf8 = translate_into_utf8(str, tok->enc);
580 if (utf8 == NULL)
581 return NULL;
582 str = PyString_AsString(utf8);
583 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000584#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 assert(tok->decoding_buffer == NULL);
586 tok->decoding_buffer = utf8; /* CAUTION */
587 return str;
588}
589
590#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000591
592/* Set up tokenizer for string */
593
594struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000595PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000596{
597 struct tok_state *tok = tok_new();
598 if (tok == NULL)
599 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600 str = (char *)decode_str(str, tok);
601 if (str == NULL)
602 return NULL;
Martin v. Löwis95292d62002-12-11 14:04:59 +0000603 /* XXX: constify members. */
604 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000605 return tok;
606}
607
608
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000609/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000610
611struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000612PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000613{
614 struct tok_state *tok = tok_new();
615 if (tok == NULL)
616 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000617 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
618 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619 return NULL;
620 }
621 tok->cur = tok->inp = tok->buf;
622 tok->end = tok->buf + BUFSIZ;
623 tok->fp = fp;
624 tok->prompt = ps1;
625 tok->nextprompt = ps2;
626 return tok;
627}
628
629
630/* Free a tok_state structure */
631
632void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000633PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000634{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635 if (tok->encoding != NULL)
636 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000637#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638 Py_XDECREF(tok->decoding_readline);
639 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000640#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000641 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000642 PyMem_DEL(tok->buf);
643 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000644}
645
646
647/* Get next char, updating state; error code goes into tok->done */
648
649static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000650tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000651{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000653 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000654 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000655 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000656 if (tok->done != E_OK)
657 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000659 char *end = strchr(tok->inp, '\n');
660 if (end != NULL)
661 end++;
662 else {
663 end = strchr(tok->inp, '\0');
664 if (end == tok->inp) {
665 tok->done = E_EOF;
666 return EOF;
667 }
668 }
669 if (tok->start == NULL)
670 tok->buf = tok->cur;
671 tok->lineno++;
672 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000673 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000674 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000676 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677 if (tok->nextprompt != NULL)
678 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000679 if (new == NULL)
680 tok->done = E_INTR;
681 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000682 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683 tok->done = E_EOF;
684 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000685 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000686 size_t start = tok->start - tok->buf;
687 size_t oldlen = tok->cur - tok->buf;
688 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000689 char *buf = tok->buf;
690 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000691 tok->lineno++;
692 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000693 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000694 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000695 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000696 tok->done = E_NOMEM;
697 return EOF;
698 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000699 tok->buf = buf;
700 tok->cur = tok->buf + oldlen;
701 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000702 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000703 tok->inp = tok->buf + newlen;
704 tok->end = tok->inp + 1;
705 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000706 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000707 else {
708 tok->lineno++;
709 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000710 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000711 tok->buf = new;
712 tok->cur = tok->buf;
713 tok->inp = strchr(tok->buf, '\0');
714 tok->end = tok->inp + 1;
715 }
716 }
717 else {
718 int done = 0;
719 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000720 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000721 if (tok->start == NULL) {
722 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000723 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000724 if (tok->buf == NULL) {
725 tok->done = E_NOMEM;
726 return EOF;
727 }
728 tok->end = tok->buf + BUFSIZ;
729 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000730 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
731 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000732 tok->done = E_EOF;
733 done = 1;
734 }
735 else {
736 tok->done = E_OK;
737 tok->inp = strchr(tok->buf, '\0');
738 done = tok->inp[-1] == '\n';
739 }
740 }
741 else {
742 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000743 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000744 tok->done = E_EOF;
745 done = 1;
746 }
747 else
748 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000749 }
750 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000751 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000752 while (!done) {
753 int curstart = tok->start == NULL ? -1 :
754 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000755 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000756 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000757 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000758 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000759 if (newbuf == NULL) {
760 tok->done = E_NOMEM;
761 tok->cur = tok->inp;
762 return EOF;
763 }
764 tok->buf = newbuf;
765 tok->inp = tok->buf + curvalid;
766 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000767 tok->start = curstart < 0 ? NULL :
768 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000769 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000770 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000771 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000772 /* Last line does not end in \n,
773 fake one */
774 strcpy(tok->inp, "\n");
775 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000776 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000777 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000778 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000780#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000781 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000782 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000783 pt = tok->inp - 2;
784 if (pt >= tok->buf && *pt == '\r') {
785 *pt++ = '\n';
786 *pt = '\0';
787 tok->inp = pt;
788 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000789#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000790 }
791 if (tok->done != E_OK) {
792 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000793 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000794 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000795 return EOF;
796 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000798 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799}
800
801
802/* Back-up one character */
803
804static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000805tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806{
807 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000808 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000809 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810 if (*tok->cur != c)
811 *tok->cur = c;
812 }
813}
814
815
816/* Return the token corresponding to a single character */
817
818int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000819PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000820{
821 switch (c) {
822 case '(': return LPAR;
823 case ')': return RPAR;
824 case '[': return LSQB;
825 case ']': return RSQB;
826 case ':': return COLON;
827 case ',': return COMMA;
828 case ';': return SEMI;
829 case '+': return PLUS;
830 case '-': return MINUS;
831 case '*': return STAR;
832 case '/': return SLASH;
833 case '|': return VBAR;
834 case '&': return AMPER;
835 case '<': return LESS;
836 case '>': return GREATER;
837 case '=': return EQUAL;
838 case '.': return DOT;
839 case '%': return PERCENT;
840 case '`': return BACKQUOTE;
841 case '{': return LBRACE;
842 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000843 case '^': return CIRCUMFLEX;
844 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845 default: return OP;
846 }
847}
848
849
Guido van Rossumfbab9051991-10-20 20:25:03 +0000850int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000851PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000852{
853 switch (c1) {
854 case '=':
855 switch (c2) {
856 case '=': return EQEQUAL;
857 }
858 break;
859 case '!':
860 switch (c2) {
861 case '=': return NOTEQUAL;
862 }
863 break;
864 case '<':
865 switch (c2) {
866 case '>': return NOTEQUAL;
867 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000868 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000869 }
870 break;
871 case '>':
872 switch (c2) {
873 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000874 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000875 }
876 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000877 case '+':
878 switch (c2) {
879 case '=': return PLUSEQUAL;
880 }
881 break;
882 case '-':
883 switch (c2) {
884 case '=': return MINEQUAL;
885 }
886 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000887 case '*':
888 switch (c2) {
889 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000890 case '=': return STAREQUAL;
891 }
892 break;
893 case '/':
894 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000895 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000896 case '=': return SLASHEQUAL;
897 }
898 break;
899 case '|':
900 switch (c2) {
901 case '=': return VBAREQUAL;
902 }
903 break;
904 case '%':
905 switch (c2) {
906 case '=': return PERCENTEQUAL;
907 }
908 break;
909 case '&':
910 switch (c2) {
911 case '=': return AMPEREQUAL;
912 }
913 break;
914 case '^':
915 switch (c2) {
916 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000917 }
918 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000919 }
920 return OP;
921}
922
Thomas Wouters434d0822000-08-24 20:11:32 +0000923int
924PyToken_ThreeChars(int c1, int c2, int c3)
925{
926 switch (c1) {
927 case '<':
928 switch (c2) {
929 case '<':
930 switch (c3) {
931 case '=':
932 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000933 }
934 break;
935 }
936 break;
937 case '>':
938 switch (c2) {
939 case '>':
940 switch (c3) {
941 case '=':
942 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000943 }
944 break;
945 }
946 break;
947 case '*':
948 switch (c2) {
949 case '*':
950 switch (c3) {
951 case '=':
952 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000953 }
954 break;
955 }
956 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000957 case '/':
958 switch (c2) {
959 case '/':
960 switch (c3) {
961 case '=':
962 return DOUBLESLASHEQUAL;
963 }
964 break;
965 }
966 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000967 }
968 return OP;
969}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000970
Guido van Rossum926f13a1998-04-09 21:38:06 +0000971static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000972indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000973{
974 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000975 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000976 tok->cur = tok->inp;
977 return 1;
978 }
979 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000980 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
981 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000982 tok->altwarning = 0;
983 }
984 return 0;
985}
986
987
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000988/* Get next token, after space stripping etc. */
989
/* tok_get -- scan one token from tok and return its token type.
   *p_start and *p_end are cleared to NULL on entry and are filled in from
   tok->start and tok->cur only for tokens that carry text: NAME, NUMBER,
   STRING, DOT, the operator/punctuation tokens, and NEWLINE (whose end
   excludes the '\n').  INDENT, DEDENT, ENDMARKER and ERRORTOKEN are
   returned with both still NULL.
   Every ERRORTOKEN return below first sets tok->done to an E_* code
   (directly or through indenterror()), except the EOF check, which only
   inspects tok->done -- presumably set by tok_nextc; confirm there. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000990static int
991tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000992{
993 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000994 int blankline;
995
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000996 *p_start = *p_end = NULL;
/* Restart point for lines that yield no NEWLINE token (blank or
   comment-only lines, and newlines inside brackets); see the "Newline"
   case further down. */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000997 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000998 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000999 blankline = 0;
1000
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001001 /* Get indentation level */
1002 if (tok->atbol) {
1003 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001004 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001005 tok->atbol = 0;
/* Measure the leading whitespace twice: col with tok->tabsize and altcol
   with tok->alttabsize.  Disagreements between the two measurements and
   their respective stacks are reported through indenterror() as
   inconsistent tab/space use. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006 for (;;) {
1007 c = tok_nextc(tok);
1008 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001009 col++, altcol++;
1010 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001011 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001012 altcol = (altcol/tok->alttabsize + 1)
1013 * tok->alttabsize;
1014 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001015 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001016 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001017 else
1018 break;
1019 }
/* c is the first non-whitespace char; hand it back so the token scanner
   below sees it (assumes tok_backup un-reads c -- confirm). */
1020 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001021 if (c == '#' || c == '\n') {
1022 /* Lines with only whitespace and/or comments
1023 shouldn't affect the indentation and are
1024 not passed to the parser as NEWLINE tokens,
1025 except *totally* empty lines in interactive
1026 mode, which signal the end of a command group. */
1027 if (col == 0 && c == '\n' && tok->prompt != NULL)
1028 blankline = 0; /* Let it through */
1029 else
1030 blankline = 1; /* Ignore completely */
1031 /* We can't jump back right here since we still
1032 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001033 }
/* Indentation is only compared against the stack on non-blank lines
   outside any bracket (tok->level == 0). */
Guido van Rossuma849b831993-05-12 11:35:44 +00001034 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001035 if (col == tok->indstack[tok->indent]) {
1036 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001037 if (altcol != tok->altindstack[tok->indent]) {
1038 if (indenterror(tok))
1039 return ERRORTOKEN;
1040 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001042 else if (col > tok->indstack[tok->indent]) {
1043 /* Indent -- always one */
1044 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001045 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001046 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001047 return ERRORTOKEN;
1048 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001049 if (altcol <= tok->altindstack[tok->indent]) {
1050 if (indenterror(tok))
1051 return ERRORTOKEN;
1052 }
/* Push the new level on both stacks and owe the caller one INDENT. */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001053 tok->pendin++;
1054 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001055 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001057 else /* col < tok->indstack[tok->indent] */ {
1058 /* Dedent -- any number, must be consistent */
/* Pop levels until col is no longer smaller than the top of the stack;
   each pop owes the caller one DEDENT (pendin goes negative). */
1059 while (tok->indent > 0 &&
1060 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001061 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001062 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001063 }
/* The dedent must land exactly on an enclosing level. */
1064 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001065 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001066 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001067 return ERRORTOKEN;
1068 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001069 if (altcol != tok->altindstack[tok->indent]) {
1070 if (indenterror(tok))
1071 return ERRORTOKEN;
1072 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073 }
1074 }
1075 }
1076
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001077 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001078
1079 /* Return pending indents/dedents */
/* pendin > 0 means that many INDENT tokens are still owed, pendin < 0
   that many DEDENT tokens; exactly one is handed out per call. */
1080 if (tok->pendin != 0) {
1081 if (tok->pendin < 0) {
1082 tok->pendin++;
1083 return DEDENT;
1084 }
1085 else {
1086 tok->pendin--;
1087 return INDENT;
1088 }
1089 }
1090
/* Restart point after a backslash-newline line continuation. */
1091 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001092 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093 /* Skip spaces */
1094 do {
1095 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001096 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097
1098 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001099 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001100
Guido van Rossumab5ca152000-03-31 00:52:27 +00001101 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001102 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001103 static char *tabforms[] = {
1104 "tab-width:", /* Emacs */
1105 ":tabstop=", /* vim, full form */
1106 ":ts=", /* vim, abbreviated form */
1107 "set tabsize=", /* will vi never die? */
1108 /* more templates can be added here to support other editors */
1109 };
1110 char cbuf[80];
1111 char *tp, **cp;
1112 tp = cbuf;
/* Copy the comment text into cbuf -- at most sizeof(cbuf)-1 chars, the
   last of which may be the '\n' or an EOF value truncated to char -- so
   that it can be searched with strstr() below. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001113 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001114 *tp++ = c = tok_nextc(tok);
1115 } while (c != EOF && c != '\n' &&
1116 tp - cbuf + 1 < sizeof(cbuf));
1117 *tp = '\0';
/* Try every known editor directive; each match that parses to a value
   in 1..40 overrides tok->tabsize, other values are ignored. */
1118 for (cp = tabforms;
1119 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1120 cp++) {
1121 if ((tp = strstr(cbuf, *cp))) {
1122 int newsize = atoi(tp + strlen(*cp));
1123
1124 if (newsize >= 1 && newsize <= 40) {
1125 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001126 if (Py_VerboseFlag)
1127 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001128 "Tab size set to %d\n",
1129 newsize);
1130 }
1131 }
1132 }
/* If the comment did not fit in cbuf, consume the rest of the line. */
1133 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001134 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 }
1136
1137 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001138 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001139 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001140 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141
1142 /* Identifier (most frequent token!) */
1143 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001144 /* Process r"", u"" and ur"" */
/* When a quote follows the prefix letter(s), control jumps to
   letter_quote with tok->start still pointing at the prefix, so the
   prefix becomes part of the STRING token. */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001145 switch (c) {
1146 case 'r':
1147 case 'R':
1148 c = tok_nextc(tok);
1149 if (c == '"' || c == '\'')
1150 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001151 break;
1152 case 'u':
1153 case 'U':
1154 c = tok_nextc(tok);
1155 if (c == 'r' || c == 'R')
1156 c = tok_nextc(tok);
1157 if (c == '"' || c == '\'')
1158 goto letter_quote;
1159 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001160 }
/* c is either the first identifier char or the char read after a
   would-be string prefix; consume the remainder of the name. */
Guido van Rossum24dacb31997-04-06 03:46:20 +00001161 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001162 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001163 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001165 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001166 *p_end = tok->cur;
1167 return NAME;
1168 }
1169
1170 /* Newline */
1171 if (c == '\n') {
1172 tok->atbol = 1;
/* Blank/comment-only lines and newlines inside brackets produce no
   NEWLINE token: start over on the next line. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001173 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001174 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001175 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001177 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001178 return NEWLINE;
1179 }
1180
Guido van Rossum2d45be11997-04-11 19:16:25 +00001181#ifdef macintosh
1182 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001183 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001184 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001185 tok->done = E_TOKEN;
1186 tok->cur = tok->inp;
1187 return ERRORTOKEN;
1188 }
1189#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001190 /* Period or number starting with period? */
1191 if (c == '.') {
1192 c = tok_nextc(tok);
1193 if (isdigit(c)) {
1194 goto fraction;
1195 }
1196 else {
1197 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001198 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001199 *p_end = tok->cur;
1200 return DOT;
1201 }
1202 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001203
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 /* Number */
/* Note: the labels fraction:, exponent: and imaginary: live inside the
   decimal branch below but are also entered by goto from the
   leading-'.' case above and from the leading-'0' branch. */
1205 if (isdigit(c)) {
1206 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001207 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208 c = tok_nextc(tok);
1209 if (c == '.')
1210 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001211#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001212 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001213 goto imaginary;
1214#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 if (c == 'x' || c == 'X') {
1216 /* Hex */
1217 do {
1218 c = tok_nextc(tok);
1219 } while (isxdigit(c));
1220 }
1221 else {
/* found_decimal records that a digit 8 or 9 followed the leading 0;
   that is only accepted when the literal then continues as a float or
   an imaginary number, otherwise it is an E_TOKEN error. */
Tim Petersd507dab2001-08-30 20:51:59 +00001222 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 /* Octal; c is first char of it */
1224 /* There's no 'isoctdigit' macro, sigh */
1225 while ('0' <= c && c < '8') {
1226 c = tok_nextc(tok);
1227 }
Tim Petersd507dab2001-08-30 20:51:59 +00001228 if (isdigit(c)) {
1229 found_decimal = 1;
1230 do {
1231 c = tok_nextc(tok);
1232 } while (isdigit(c));
1233 }
1234 if (c == '.')
1235 goto fraction;
1236 else if (c == 'e' || c == 'E')
1237 goto exponent;
1238#ifndef WITHOUT_COMPLEX
1239 else if (c == 'j' || c == 'J')
1240 goto imaginary;
1241#endif
1242 else if (found_decimal) {
1243 tok->done = E_TOKEN;
1244 tok_backup(tok, c);
1245 return ERRORTOKEN;
1246 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001247 }
/* Optional long-integer suffix after a hex or octal literal. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001248 if (c == 'l' || c == 'L')
1249 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 }
1251 else {
1252 /* Decimal */
1253 do {
1254 c = tok_nextc(tok);
1255 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001256 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001258 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001259 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001260 if (c == '.') {
1261 fraction:
1262 /* Fraction */
1263 do {
1264 c = tok_nextc(tok);
1265 } while (isdigit(c));
1266 }
1267 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001268 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001269 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001270 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001271 if (c == '+' || c == '-')
1272 c = tok_nextc(tok);
/* At least one digit must follow the 'e' and optional sign. */
Tim Peters9aa70d92001-08-27 19:19:28 +00001273 if (!isdigit(c)) {
1274 tok->done = E_TOKEN;
1275 tok_backup(tok, c);
1276 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001277 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001278 do {
1279 c = tok_nextc(tok);
1280 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001282#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001283 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001284 /* Imaginary part */
1285 imaginary:
1286 c = tok_nextc(tok);
1287#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 }
1289 }
/* c is the first char that is not part of the number; un-read it. */
1290 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001291 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 *p_end = tok->cur;
1293 return NUMBER;
1294 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001295
1296 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001297 /* String */
1298 if (c == '\'' || c == '"') {
/* quote2 is the value tok->cur - tok->start takes when the char just
   read is the one directly after the opening quote (assuming tok_nextc
   advances tok->cur by one -- confirm).  A quote seen at that offset is
   either the end of an empty string or, when yet another quote follows,
   the opener of a triple-quoted string.
   tripcount counts consecutive quote chars inside a triple-quoted
   string; any other char resets it and three in a row end the string. */
Guido van Rossum35685241998-02-16 15:42:50 +00001299 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001300 int quote = c;
1301 int triple = 0;
1302 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001303 for (;;) {
1304 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001305 if (c == '\n') {
/* A raw newline is only legal inside a triple-quoted string. */
1306 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001307 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001308 tok_backup(tok, c);
1309 return ERRORTOKEN;
1310 }
1311 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001312 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001313 }
1314 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001315 if (triple)
1316 tok->done = E_EOFS;
1317 else
1318 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001319 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 return ERRORTOKEN;
1321 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001322 else if (c == quote) {
1323 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001324 if (tok->cur - tok->start == quote2) {
/* Second quote right after the opener: peek at the next char to tell
   an empty string from a triple-quote opener. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001325 c = tok_nextc(tok);
1326 if (c == quote) {
1327 triple = 1;
1328 tripcount = 0;
1329 continue;
1330 }
1331 tok_backup(tok, c);
1332 }
1333 if (!triple || tripcount == 3)
1334 break;
1335 }
1336 else if (c == '\\') {
/* Swallow the char after the backslash without examining it (other than
   for EOF), so an escaped quote cannot terminate the string. */
1337 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001339 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001340 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001341 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001342 return ERRORTOKEN;
1343 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001345 else
1346 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001348 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001349 *p_end = tok->cur;
1350 return STRING;
1351 }
1352
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 /* Line continuation */
1354 if (c == '\\') {
1355 c = tok_nextc(tok);
/* A backslash outside a string must be immediately followed by a
   newline; anything else is an E_TOKEN error. */
1356 if (c != '\n') {
1357 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001358 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001359 return ERRORTOKEN;
1360 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001361 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 goto again; /* Read next line */
1363 }
1364
Guido van Rossumfbab9051991-10-20 20:25:03 +00001365 /* Check for two-character token */
1366 {
1367 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001368 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001369 if (token != OP) {
/* A two-character operator may be the prefix of a three-character one
   (e.g. "<<" and "<<="): peek one more char and un-read it when it does
   not extend the operator. */
Thomas Wouters434d0822000-08-24 20:11:32 +00001370 int c3 = tok_nextc(tok);
1371 int token3 = PyToken_ThreeChars(c, c2, c3);
1372 if (token3 != OP) {
1373 token = token3;
1374 } else {
1375 tok_backup(tok, c3);
1376 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001377 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001378 *p_end = tok->cur;
1379 return token;
1380 }
1381 tok_backup(tok, c2);
1382 }
1383
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 /* Keep track of parentheses nesting level */
/* While tok->level > 0 the newline and indentation handling above is
   suppressed.  Note that nothing here stops level from going negative
   on an unmatched closing bracket. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001385 switch (c) {
1386 case '(':
1387 case '[':
1388 case '{':
1389 tok->level++;
1390 break;
1391 case ')':
1392 case ']':
1393 case '}':
1394 tok->level--;
1395 break;
1396 }
1397
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001398 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001400 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001401 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402}
1403
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001404int
1405PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1406{
1407 int result = tok_get(tok, p_start, p_end);
1408 if (tok->decoding_erred) {
1409 result = ERRORTOKEN;
1410 tok->done = E_DECODE;
1411 }
1412 return result;
1413}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414
Guido van Rossum408027e1996-12-30 16:17:54 +00001415#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001416
1417void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001418tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419{
Guido van Rossum86bea461997-04-29 21:03:06 +00001420 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1422 printf("(%.*s)", (int)(end - start), start);
1423}
1424
1425#endif