blob: a97720c54cf3d1544f7b7eed4e1d834091eb5819 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative int.
   When plain char is already unsigned the value is passed through;
   otherwise the low eight bits are kept. */
/* XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* one-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* comparisons, bitwise operators, shifts, power */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* catch-all and sentinels */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* pgen build: strings are used as-is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyMem_DEL(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
172new_string(const char *s, int len)
173{
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the common spellings of utf-8 and latin-1 to one canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   and turning '_' into '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S is one of the recognised spellings (optionally followed by
   "-<something>"); otherwise returns S itself, unchanged.  The caller
   tells the two cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
204
/* Return the coding spec in S, or NULL if none is found.

   S holds SIZE bytes of source text.  The spec must sit in a comment,
   and that comment must be the only statement on the source line: only
   blanks, tabs and form feeds may come before the '#'.  Inside the
   comment the function looks for "coding" followed directly by ':' or
   '=', optional blanks/tabs, and a name made of alphanumerics, '-', '_'
   and '.'.

   The result is a newly allocated string (normalised through
   get_normal_name) that the caller must release with PyMem_DEL.
   NULL is also returned when memory runs out. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* Py_CHARMASK keeps the isalnum() argument nonnegative;
           a raw signed char >= 0x80 would be undefined behaviour. */
        while (isalnum(Py_CHARMASK(t[0])) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;	/* "coding:" with no name; keep looking */

        r = new_string(begin, t - begin);
        if (r == NULL)
            return NULL;	/* out of memory: do not hand NULL to get_normal_name */
        q = get_normal_name(r);
        if (r != q) {
            /* q is a string literal; return a heap copy so the caller
               can free the result uniformly. */
            PyMem_DEL(r);
            r = new_string(q, strlen(q));
        }
        return r;
    }
    return NULL;
}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
255check_coding_spec(const char* line, int size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000279#else
280 /* Without Unicode support, we cannot
281 process the coding spec. Since there
282 won't be any Unicode literals, that
283 won't matter. */
284#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
286 } else { /* then, compare cs with BOM */
287 r = (strcmp(tok->encoding, cs) == 0);
288 PyMem_DEL(cs);
289 }
290 }
291 return r;
292}
293
294/* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
297
298static int
299check_bom(int get_char(struct tok_state *),
300 void unget_char(int, struct tok_state *),
301 int set_readline(struct tok_state *, const char *),
302 struct tok_state *tok)
303{
304 int ch = get_char(tok);
305 tok->decoding_state = 1;
306 if (ch == EOF) {
307 return 1;
308 } else if (ch == 0xEF) {
309 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
310 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
311#if 0
312 /* Disable support for UTF-16 BOMs until a decision
313 is made whether this needs to be supported. */
314 } else if (ch == 0xFE) {
315 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
316 if (!set_readline(tok, "utf-16-be")) return 0;
317 tok->decoding_state = -1;
318 } else if (ch == 0xFF) {
319 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
320 if (!set_readline(tok, "utf-16-le")) return 0;
321 tok->decoding_state = -1;
322#endif
323 } else {
324 unget_char(ch, tok);
325 return 1;
326 }
327 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
328 return 1;
329 NON_BOM:
330 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
331 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
332 return 1;
333}
334
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   S has room for SIZE bytes including the terminating NUL.  The text
   comes from tok->decoding_buffer if one is pending, otherwise from
   calling tok->decoding_readline with a size hint of SIZE-1, and is
   stored in S encoded as UTF-8.

   The UTF-8 form can be longer than SIZE-1 bytes (the hint given to
   readline is not a byte limit on the encoded result).  In that case
   only the first SIZE-1 bytes are copied and the rest is parked in
   tok->decoding_buffer as a string object, to be returned by the next
   call.  NULL is returned at EOF (empty result) and, via error_ret,
   on any error. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    const char *str;
    int utf8len;
    int room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (buf == NULL) {
        PyObject *args = Py_BuildValue("(i)", room);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
        /* A plain string here is the overflow parked by an earlier
           call (see below); it is already UTF-8. */
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    if (str == NULL) {
        Py_DECREF(utf8);
        return error_ret(tok);
    }
    utf8len = (int)PyString_Size(utf8);
    if (utf8len > room) {
        /* Too long for S: keep the tail for the next call instead of
           writing past the end of the caller's buffer. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(str + room, utf8len - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = room;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (s[0] == '\0') return NULL; /* EOF */
    return s;
#endif
}
374
375/* Set the readline function for TOK to a StreamReader's
376 readline function. The StreamReader is named ENC.
377
378 This function is called from check_bom and check_coding_spec.
379
380 ENC is usually identical to the future value of tok->encoding,
381 except for the (currently unsupported) case of UTF-16.
382
383 Return 1 on success, 0 on failure. */
384
385static int
386fp_setreadl(struct tok_state *tok, const char* enc)
387{
388 PyObject *reader, *stream, *readline;
389
Martin v. Löwis95292d62002-12-11 14:04:59 +0000390 /* XXX: constify filename argument. */
391 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000392 if (stream == NULL)
393 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000394
395 reader = PyCodec_StreamReader(enc, stream, NULL);
396 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000397 if (reader == NULL)
398 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399
400 readline = PyObject_GetAttrString(reader, "readline");
401 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000402 if (readline == NULL)
403 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000404
405 tok->decoding_readline = readline;
406 return 1;
407}
408
409/* Fetch the next byte from TOK. */
410
411static int fp_getc(struct tok_state *tok) {
412 return getc(tok->fp);
413}
414
415/* Unfetch the last byte back into TOK. */
416
417static void fp_ungetc(int c, struct tok_state *tok) {
418 ungetc(c, tok->fp);
419}
420
421/* Read a line of input from TOK. Determine encoding
422 if necessary. */
423
424static char *
425decoding_fgets(char *s, int size, struct tok_state *tok)
426{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000427 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430 if (tok->decoding_state < 0) {
431 /* We already have a codec associated with
432 this input. */
433 line = fp_readl(s, size, tok);
434 break;
435 } else if (tok->decoding_state > 0) {
436 /* We want a 'raw' read. */
437 line = Py_UniversalNewlineFgets(s, size,
438 tok->fp, NULL);
439 warn = 1;
440 break;
441 } else {
442 /* We have not yet determined the encoding.
443 If an encoding is found, use the file-pointer
444 reader functions from now on. */
445 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
446 return error_ret(tok);
447 assert(tok->decoding_state != 0);
448 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000449 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000450 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
451 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
452 return error_ret(tok);
453 }
454 }
455#ifndef PGEN
456 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
457 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000458 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459 if (*c > 127) {
460 badchar = *c;
461 break;
462 }
463 }
464 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000465 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000466 /* Need to add 1 to the line number, since this line
467 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000468 sprintf(buf,
469 "Non-ASCII character '\\x%.2x' "
470 "in file %.200s on line %i, "
471 "but no encoding declared; "
472 "see http://www.python.org/peps/pep-0263.html for details",
473 badchar, tok->filename, tok->lineno + 1);
474 /* We don't use PyErr_WarnExplicit() here because
475 printing the line in question to e.g. a log file
476 could result in sensitive information being
477 exposed. */
478 PyErr_Warn(PyExc_DeprecationWarning, buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 tok->issued_encoding_warning = 1;
480 }
481#endif
482 return line;
483}
484
485static int
486decoding_feof(struct tok_state *tok)
487{
488 if (tok->decoding_state >= 0) {
489 return feof(tok->fp);
490 } else {
491 PyObject* buf = tok->decoding_buffer;
492 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000493 PyObject *args = PyTuple_New(0);
494 if (args == NULL) {
495 error_ret(tok);
496 return 1;
497 }
498 buf = PyObject_Call(tok->decoding_readline,
499 args, NULL);
500 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000501 if (buf == NULL) {
502 error_ret(tok);
503 return 1;
504 } else {
505 tok->decoding_buffer = buf;
506 }
507 }
508 return PyObject_Length(buf) == 0;
509 }
510}
511
512/* Fetch a byte from TOK, using the string buffer. */
513
514static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000515 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516}
517
518/* Unfetch a byte from TOK, using the string buffer. */
519
520static void buf_ungetc(int c, struct tok_state *tok) {
521 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000522 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523}
524
525/* Set the readline function for TOK to ENC. For the string-based
526 tokenizer, this means to just record the encoding. */
527
528static int buf_setreadl(struct tok_state *tok, const char* enc) {
529 tok->enc = enc;
530 return 1;
531}
532
/* Decode the NUL-terminated byte string STR using codec ENC and return
   the text re-encoded as UTF-8 in a new Python string object.
   Returns NULL if either step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548
549/* Decode a byte string STR for use as the buffer of TOK.
550 Look for encoding declarations inside STR, and record them
551 inside TOK. */
552
553static const char *
554decode_str(const char *str, struct tok_state *tok)
555{
556 PyObject* utf8 = NULL;
557 const char *s;
558 int lineno = 0;
559 tok->enc = NULL;
560 tok->str = str;
561 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
562 return NULL;
563 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000564 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000565#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000566 if (tok->enc != NULL) {
567 utf8 = translate_into_utf8(str, tok->enc);
568 if (utf8 == NULL)
569 return NULL;
570 str = PyString_AsString(utf8);
571 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000572#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573 for (s = str;; s++) {
574 if (*s == '\0') break;
575 else if (*s == '\n') {
576 lineno++;
577 if (lineno == 2) break;
578 }
579 }
580 tok->enc = NULL;
581 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
582 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000583#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584 if (tok->enc != NULL) {
585 assert(utf8 == NULL);
586 utf8 = translate_into_utf8(str, tok->enc);
587 if (utf8 == NULL)
588 return NULL;
589 str = PyString_AsString(utf8);
590 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000591#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 assert(tok->decoding_buffer == NULL);
593 tok->decoding_buffer = utf8; /* CAUTION */
594 return str;
595}
596
597#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000598
599/* Set up tokenizer for string */
600
601struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000602PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603{
604 struct tok_state *tok = tok_new();
605 if (tok == NULL)
606 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 str = (char *)decode_str(str, tok);
608 if (str == NULL)
609 return NULL;
Martin v. Löwis95292d62002-12-11 14:04:59 +0000610 /* XXX: constify members. */
611 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000612 return tok;
613}
614
615
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000616/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617
618struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000619PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000620{
621 struct tok_state *tok = tok_new();
622 if (tok == NULL)
623 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000624 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
625 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000626 return NULL;
627 }
628 tok->cur = tok->inp = tok->buf;
629 tok->end = tok->buf + BUFSIZ;
630 tok->fp = fp;
631 tok->prompt = ps1;
632 tok->nextprompt = ps2;
633 return tok;
634}
635
636
637/* Free a tok_state structure */
638
639void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000640PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000641{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 if (tok->encoding != NULL)
643 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000644#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645 Py_XDECREF(tok->decoding_readline);
646 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000647#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000649 PyMem_DEL(tok->buf);
650 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000651}
652
653
654/* Get next char, updating state; error code goes into tok->done */
655
656static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000657tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000660 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000661 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000662 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000663 if (tok->done != E_OK)
664 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000665 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000666 char *end = strchr(tok->inp, '\n');
667 if (end != NULL)
668 end++;
669 else {
670 end = strchr(tok->inp, '\0');
671 if (end == tok->inp) {
672 tok->done = E_EOF;
673 return EOF;
674 }
675 }
676 if (tok->start == NULL)
677 tok->buf = tok->cur;
678 tok->lineno++;
679 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000680 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000683 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 if (tok->nextprompt != NULL)
685 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000686 if (new == NULL)
687 tok->done = E_INTR;
688 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000689 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000690 tok->done = E_EOF;
691 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000692 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000693 size_t start = tok->start - tok->buf;
694 size_t oldlen = tok->cur - tok->buf;
695 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000696 char *buf = tok->buf;
697 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000698 tok->lineno++;
699 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000700 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000701 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000702 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000703 tok->done = E_NOMEM;
704 return EOF;
705 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000706 tok->buf = buf;
707 tok->cur = tok->buf + oldlen;
708 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000709 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000710 tok->inp = tok->buf + newlen;
711 tok->end = tok->inp + 1;
712 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000713 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000714 else {
715 tok->lineno++;
716 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000717 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000718 tok->buf = new;
719 tok->cur = tok->buf;
720 tok->inp = strchr(tok->buf, '\0');
721 tok->end = tok->inp + 1;
722 }
723 }
724 else {
725 int done = 0;
726 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000727 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000728 if (tok->start == NULL) {
729 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000730 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000731 if (tok->buf == NULL) {
732 tok->done = E_NOMEM;
733 return EOF;
734 }
735 tok->end = tok->buf + BUFSIZ;
736 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000737 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
738 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000739 tok->done = E_EOF;
740 done = 1;
741 }
742 else {
743 tok->done = E_OK;
744 tok->inp = strchr(tok->buf, '\0');
745 done = tok->inp[-1] == '\n';
746 }
747 }
748 else {
749 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000750 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000751 tok->done = E_EOF;
752 done = 1;
753 }
754 else
755 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000756 }
757 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000758 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000759 while (!done) {
760 int curstart = tok->start == NULL ? -1 :
761 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000762 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000763 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000764 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000765 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000766 if (newbuf == NULL) {
767 tok->done = E_NOMEM;
768 tok->cur = tok->inp;
769 return EOF;
770 }
771 tok->buf = newbuf;
772 tok->inp = tok->buf + curvalid;
773 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000774 tok->start = curstart < 0 ? NULL :
775 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000776 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000777 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000778 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 /* Last line does not end in \n,
780 fake one */
781 strcpy(tok->inp, "\n");
782 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000783 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000784 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000785 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000786 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000787#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000788 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000789 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000790 pt = tok->inp - 2;
791 if (pt >= tok->buf && *pt == '\r') {
792 *pt++ = '\n';
793 *pt = '\0';
794 tok->inp = pt;
795 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000796#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 }
798 if (tok->done != E_OK) {
799 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000800 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000801 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802 return EOF;
803 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000804 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000805 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806}
807
808
809/* Back-up one character */
810
811static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000812tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
814 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000815 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000816 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817 if (*tok->cur != c)
818 *tok->cur = c;
819 }
820}
821
822
823/* Return the token corresponding to a single character */
824
825int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000826PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000827{
828 switch (c) {
829 case '(': return LPAR;
830 case ')': return RPAR;
831 case '[': return LSQB;
832 case ']': return RSQB;
833 case ':': return COLON;
834 case ',': return COMMA;
835 case ';': return SEMI;
836 case '+': return PLUS;
837 case '-': return MINUS;
838 case '*': return STAR;
839 case '/': return SLASH;
840 case '|': return VBAR;
841 case '&': return AMPER;
842 case '<': return LESS;
843 case '>': return GREATER;
844 case '=': return EQUAL;
845 case '.': return DOT;
846 case '%': return PERCENT;
847 case '`': return BACKQUOTE;
848 case '{': return LBRACE;
849 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000850 case '^': return CIRCUMFLEX;
851 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852 default: return OP;
853 }
854}
855
856
Guido van Rossumfbab9051991-10-20 20:25:03 +0000857int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000858PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000859{
860 switch (c1) {
861 case '=':
862 switch (c2) {
863 case '=': return EQEQUAL;
864 }
865 break;
866 case '!':
867 switch (c2) {
868 case '=': return NOTEQUAL;
869 }
870 break;
871 case '<':
872 switch (c2) {
873 case '>': return NOTEQUAL;
874 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000875 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000876 }
877 break;
878 case '>':
879 switch (c2) {
880 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000881 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000882 }
883 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000884 case '+':
885 switch (c2) {
886 case '=': return PLUSEQUAL;
887 }
888 break;
889 case '-':
890 switch (c2) {
891 case '=': return MINEQUAL;
892 }
893 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000894 case '*':
895 switch (c2) {
896 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000897 case '=': return STAREQUAL;
898 }
899 break;
900 case '/':
901 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000902 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000903 case '=': return SLASHEQUAL;
904 }
905 break;
906 case '|':
907 switch (c2) {
908 case '=': return VBAREQUAL;
909 }
910 break;
911 case '%':
912 switch (c2) {
913 case '=': return PERCENTEQUAL;
914 }
915 break;
916 case '&':
917 switch (c2) {
918 case '=': return AMPEREQUAL;
919 }
920 break;
921 case '^':
922 switch (c2) {
923 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000924 }
925 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000926 }
927 return OP;
928}
929
Thomas Wouters434d0822000-08-24 20:11:32 +0000930int
931PyToken_ThreeChars(int c1, int c2, int c3)
932{
933 switch (c1) {
934 case '<':
935 switch (c2) {
936 case '<':
937 switch (c3) {
938 case '=':
939 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000940 }
941 break;
942 }
943 break;
944 case '>':
945 switch (c2) {
946 case '>':
947 switch (c3) {
948 case '=':
949 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000950 }
951 break;
952 }
953 break;
954 case '*':
955 switch (c2) {
956 case '*':
957 switch (c3) {
958 case '=':
959 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000960 }
961 break;
962 }
963 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000964 case '/':
965 switch (c2) {
966 case '/':
967 switch (c3) {
968 case '=':
969 return DOUBLESLASHEQUAL;
970 }
971 break;
972 }
973 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000974 }
975 return OP;
976}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000977
Guido van Rossum926f13a1998-04-09 21:38:06 +0000978static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000979indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000980{
981 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000982 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000983 tok->cur = tok->inp;
984 return 1;
985 }
986 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000987 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
988 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000989 tok->altwarning = 0;
990 }
991 return 0;
992}
993
994
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995/* Get next token, after space stripping etc. */
996
/* tok_get: scan the next token from tok and return its token type.

   *p_start and *p_end are first set to NULL.  They are filled in with
   the bounds of the token text for NAME, NEWLINE, DOT, NUMBER, STRING
   and operator/punctuation tokens, and are left NULL when INDENT,
   DEDENT, ENDMARKER or ERRORTOKEN is returned.

   On error, ERRORTOKEN is returned and tok->done holds an E_* code
   (E_TABSPACE via indenterror(), E_TOODEEP, E_DEDENT, E_TOKEN, E_EOLS,
   E_EOFS).

   State used across calls: tok->atbol (at beginning of line),
   tok->pendin (pending INDENT tokens if positive, pending DEDENT
   tokens if negative), tok->level (bracket nesting depth),
   tok->indstack / tok->altindstack (indentation columns measured with
   tok->tabsize and tok->alttabsize respectively). */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000997static int
998tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000999{
1000 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001001 int blankline;
1002
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001003 *p_start = *p_end = NULL;
/* Restart point used after a NEWLINE that is not reported to the
   caller (blank line or newline inside brackets). */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001004 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001005 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001006 blankline = 0;
1007
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001008 /* Get indentation level */
1009 if (tok->atbol) {
/* col is the indentation column computed with tok->tabsize, altcol
   the same computed with tok->alttabsize; comparing both against
   the stacks below detects inconsistent tab/space use. */
1010 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001011 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001012 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001013 for (;;) {
1014 c = tok_nextc(tok);
1015 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001016 col++, altcol++;
1017 else if (c == '\t') {
/* A tab advances to the next multiple of the tab size. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001018 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001019 altcol = (altcol/tok->alttabsize + 1)
1020 * tok->alttabsize;
1021 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001022 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001023 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001024 else
1025 break;
1026 }
/* Push back the first character that is not indentation. */
1027 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001028 if (c == '#' || c == '\n') {
1029 /* Lines with only whitespace and/or comments
1030 shouldn't affect the indentation and are
1031 not passed to the parser as NEWLINE tokens,
1032 except *totally* empty lines in interactive
1033 mode, which signal the end of a command group. */
1034 if (col == 0 && c == '\n' && tok->prompt != NULL)
1035 blankline = 0; /* Let it through */
1036 else
1037 blankline = 1; /* Ignore completely */
1038 /* We can't jump back right here since we still
1039 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001040 }
/* Indentation is only compared against the stack on non-blank
   lines outside brackets (tok->level == 0). */
Guido van Rossuma849b831993-05-12 11:35:44 +00001041 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001042 if (col == tok->indstack[tok->indent]) {
1043 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001044 if (altcol != tok->altindstack[tok->indent]) {
1045 if (indenterror(tok))
1046 return ERRORTOKEN;
1047 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001049 else if (col > tok->indstack[tok->indent]) {
1050 /* Indent -- always one */
1051 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001052 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001053 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001054 return ERRORTOKEN;
1055 }
/* The alternate column must have increased as well. */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001056 if (altcol <= tok->altindstack[tok->indent]) {
1057 if (indenterror(tok))
1058 return ERRORTOKEN;
1059 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001060 tok->pendin++;
1061 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001062 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001063 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001064 else /* col < tok->indstack[tok->indent] */ {
1065 /* Dedent -- any number, must be consistent */
1066 while (tok->indent > 0 &&
1067 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001068 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001069 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001070 }
/* After popping, col must equal an enclosing indentation level. */
1071 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001072 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001073 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001074 return ERRORTOKEN;
1075 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001076 if (altcol != tok->altindstack[tok->indent]) {
1077 if (indenterror(tok))
1078 return ERRORTOKEN;
1079 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001080 }
1081 }
1082 }
1083
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001084 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001085
1086 /* Return pending indents/dedents */
/* Exactly one INDENT or DEDENT is returned per call, moving
   tok->pendin one step toward zero each time. */
1087 if (tok->pendin != 0) {
1088 if (tok->pendin < 0) {
1089 tok->pendin++;
1090 return DEDENT;
1091 }
1092 else {
1093 tok->pendin--;
1094 return INDENT;
1095 }
1096 }
1097
/* Re-entry point after a backslash line continuation. */
1098 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001099 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001100 /* Skip spaces */
1101 do {
1102 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001103 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104
1105 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001106 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001107
Guido van Rossumab5ca152000-03-31 00:52:27 +00001108 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001110 static char *tabforms[] = {
1111 "tab-width:", /* Emacs */
1112 ":tabstop=", /* vim, full form */
1113 ":ts=", /* vim, abbreviated form */
1114 "set tabsize=", /* will vi never die? */
1115 /* more templates can be added here to support other editors */
1116 };
1117 char cbuf[80];
1118 char *tp, **cp;
1119 tp = cbuf;
/* Copy at most sizeof(cbuf) - 1 characters of the comment into
   cbuf (stopping early at newline or EOF) so that the terminating
   '\0' always fits. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001121 *tp++ = c = tok_nextc(tok);
1122 } while (c != EOF && c != '\n' &&
1123 tp - cbuf + 1 < sizeof(cbuf));
1124 *tp = '\0';
/* Search the copied text for each editor tab-size template; the
   number following a match becomes tok->tabsize when it lies in
   the range 1..40. */
1125 for (cp = tabforms;
1126 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1127 cp++) {
1128 if ((tp = strstr(cbuf, *cp))) {
1129 int newsize = atoi(tp + strlen(*cp));
1130
1131 if (newsize >= 1 && newsize <= 40) {
1132 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001133 if (Py_VerboseFlag)
1134 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001135 "Tab size set to %d\n",
1136 newsize);
1137 }
1138 }
1139 }
/* Consume whatever is left of a comment longer than cbuf. */
1140 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142 }
1143
1144 /* Check for EOF and errors now */
/* NOTE(review): tok->done is presumably set by tok_nextc() when it
   returns EOF -- confirm against tok_nextc. */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001145 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001147 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001148
1149 /* Identifier (most frequent token!) */
1150 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001151 /* Process r"", u"" and ur"" */
/* If a quote follows the prefix letter(s), control jumps to the
   string scanner with tok->start still at the prefix.  Otherwise
   the characters already read are treated as the beginning of an
   ordinary identifier. */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001152 switch (c) {
1153 case 'r':
1154 case 'R':
1155 c = tok_nextc(tok);
1156 if (c == '"' || c == '\'')
1157 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001158 break;
1159 case 'u':
1160 case 'U':
1161 c = tok_nextc(tok);
1162 if (c == 'r' || c == 'R')
1163 c = tok_nextc(tok);
1164 if (c == '"' || c == '\'')
1165 goto letter_quote;
1166 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001167 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001168 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001169 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001170 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001171 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001172 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001173 *p_end = tok->cur;
1174 return NAME;
1175 }
1176
1177 /* Newline */
1178 if (c == '\n') {
1179 tok->atbol = 1;
/* A newline on a blank line or inside brackets is not reported;
   scanning restarts on the following line. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001180 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001181 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001182 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001183 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001184 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 return NEWLINE;
1186 }
1187
Guido van Rossum2d45be11997-04-11 19:16:25 +00001188#ifdef macintosh
1189 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001190 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001191 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001192 tok->done = E_TOKEN;
1193 tok->cur = tok->inp;
1194 return ERRORTOKEN;
1195 }
1196#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001197 /* Period or number starting with period? */
1198 if (c == '.') {
1199 c = tok_nextc(tok);
1200 if (isdigit(c)) {
1201 goto fraction;
1202 }
1203 else {
1204 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001205 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001206 *p_end = tok->cur;
1207 return DOT;
1208 }
1209 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001210
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 /* Number */
/* The labels fraction, exponent and imaginary inside the decimal
   branch below are also jump targets for the leading-'.' and
   leading-'0' paths. */
1212 if (isdigit(c)) {
1213 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001214 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 c = tok_nextc(tok);
1216 if (c == '.')
1217 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001218#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001219 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001220 goto imaginary;
1221#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 if (c == 'x' || c == 'X') {
1223 /* Hex */
1224 do {
1225 c = tok_nextc(tok);
1226 } while (isxdigit(c));
1227 }
1228 else {
/* found_decimal records that a digit 8 or 9 followed the leading
   0.  Such a literal is accepted only if it continues as a float
   or imaginary number; otherwise it is an E_TOKEN error. */
Tim Petersd507dab2001-08-30 20:51:59 +00001229 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001230 /* Octal; c is first char of it */
1231 /* There's no 'isoctdigit' macro, sigh */
1232 while ('0' <= c && c < '8') {
1233 c = tok_nextc(tok);
1234 }
Tim Petersd507dab2001-08-30 20:51:59 +00001235 if (isdigit(c)) {
1236 found_decimal = 1;
1237 do {
1238 c = tok_nextc(tok);
1239 } while (isdigit(c));
1240 }
1241 if (c == '.')
1242 goto fraction;
1243 else if (c == 'e' || c == 'E')
1244 goto exponent;
1245#ifndef WITHOUT_COMPLEX
1246 else if (c == 'j' || c == 'J')
1247 goto imaginary;
1248#endif
1249 else if (found_decimal) {
1250 tok->done = E_TOKEN;
1251 tok_backup(tok, c);
1252 return ERRORTOKEN;
1253 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 }
/* Optional long-integer suffix. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001255 if (c == 'l' || c == 'L')
1256 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 }
1258 else {
1259 /* Decimal */
1260 do {
1261 c = tok_nextc(tok);
1262 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001263 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001265 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001266 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001267 if (c == '.') {
1268 fraction:
1269 /* Fraction */
1270 do {
1271 c = tok_nextc(tok);
1272 } while (isdigit(c));
1273 }
1274 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001275 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001276 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001278 if (c == '+' || c == '-')
1279 c = tok_nextc(tok);
/* At least one digit is required after 'e' and the optional sign. */
Tim Peters9aa70d92001-08-27 19:19:28 +00001280 if (!isdigit(c)) {
1281 tok->done = E_TOKEN;
1282 tok_backup(tok, c);
1283 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001284 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001285 do {
1286 c = tok_nextc(tok);
1287 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001289#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001290 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001291 /* Imaginary part */
1292 imaginary:
1293 c = tok_nextc(tok);
1294#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001295 }
1296 }
/* c is the first character past the number; push it back. */
1297 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001298 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299 *p_end = tok->cur;
1300 return NUMBER;
1301 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001302
1303 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001304 /* String */
1305 if (c == '\'' || c == '"') {
/* quote2 is the value tok->cur - tok->start will have right after
   reading a quote that immediately follows the opening quote
   (any r/u prefix is included in the offset).
   triple is set once the opening delimiter is three quotes;
   tripcount counts consecutive quote characters seen inside a
   triple-quoted string, and three in a row end the string. */
Guido van Rossum35685241998-02-16 15:42:50 +00001306 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001307 int quote = c;
1308 int triple = 0;
1309 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 for (;;) {
1311 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001312 if (c == '\n') {
/* An unescaped newline is only allowed in a triple-quoted string. */
1313 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001314 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001315 tok_backup(tok, c);
1316 return ERRORTOKEN;
1317 }
1318 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001319 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001320 }
1321 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001322 if (triple)
1323 tok->done = E_EOFS;
1324 else
1325 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001326 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 return ERRORTOKEN;
1328 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001329 else if (c == quote) {
1330 tripcount++;
/* Second quote directly after the opening quote: a third quote
   opens a triple-quoted string; otherwise this is the empty
   string and the look-ahead character is pushed back. */
Guido van Rossum35685241998-02-16 15:42:50 +00001331 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001332 c = tok_nextc(tok);
1333 if (c == quote) {
1334 triple = 1;
1335 tripcount = 0;
1336 continue;
1337 }
1338 tok_backup(tok, c);
1339 }
1340 if (!triple || tripcount == 3)
1341 break;
1342 }
1343 else if (c == '\\') {
/* The character after a backslash is consumed without being
   examined, except that EOF there is an error. */
1344 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001345 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001346 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001347 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001348 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 return ERRORTOKEN;
1350 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001352 else
1353 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001355 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001356 *p_end = tok->cur;
1357 return STRING;
1358 }
1359
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 /* Line continuation */
1361 if (c == '\\') {
1362 c = tok_nextc(tok);
/* A backslash must be the last character on its line. */
1363 if (c != '\n') {
1364 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001365 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 return ERRORTOKEN;
1367 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001368 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 goto again; /* Read next line */
1370 }
1371
Guido van Rossumfbab9051991-10-20 20:25:03 +00001372 /* Check for two-character token */
/* A three-character operator is only tried when the first two
   characters already form a two-character operator; each
   look-ahead character that does not extend the token is pushed
   back. */
1373 {
1374 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001375 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001376 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001377 int c3 = tok_nextc(tok);
1378 int token3 = PyToken_ThreeChars(c, c2, c3);
1379 if (token3 != OP) {
1380 token = token3;
1381 } else {
1382 tok_backup(tok, c3);
1383 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001385 *p_end = tok->cur;
1386 return token;
1387 }
1388 tok_backup(tok, c2);
1389 }
1390
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001391 /* Keep track of parentheses nesting level */
/* tok->level is only incremented and decremented here; bracket
   kinds are not matched and the count is not checked for going
   below zero. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001392 switch (c) {
1393 case '(':
1394 case '[':
1395 case '{':
1396 tok->level++;
1397 break;
1398 case ')':
1399 case ']':
1400 case '}':
1401 tok->level--;
1402 break;
1403 }
1404
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001405 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001406 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001408 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409}
1410
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001411int
1412PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1413{
1414 int result = tok_get(tok, p_start, p_end);
1415 if (tok->decoding_erred) {
1416 result = ERRORTOKEN;
1417 tok->done = E_DECODE;
1418 }
1419 return result;
1420}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421
Guido van Rossum408027e1996-12-30 16:17:54 +00001422#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423
1424void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001425tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001426{
Guido van Rossum86bea461997-04-29 21:03:06 +00001427 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001428 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1429 printf("(%.*s)", (int)(end - start), start);
1430}
1431
1432#endif