blob: aaed637a2fdec2845a6b4f6089a5ae987fc7916e [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
25
/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Printable names of the token types.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",
	/* Single-character operators and delimiters */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",
	/* Comparisons */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* Bitwise, shift and power operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	"DOUBLESTAR",
	/* Augmented assignment */
	"PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
	"PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
	/* Floor division */
	"DOUBLESLASH", "DOUBLESLASHEQUAL",
	/* Catch-all operator and the two sentinel entries */
	"OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* PGEN build: strings are used as they are; STR is handed back
   unchanged and TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	const char *unchanged = str;

	return unchanged;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyMem_DEL(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
172new_string(const char *s, int len)
173{
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the spelling variants of utf-8 and latin-1 to one canonical name.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the string constant "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, unmodified. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;

	for (i = 0; i < 12; i++) {
		/* Go through unsigned char: handing a negative value
		   (other than EOF) to tolower() is undefined. */
		int c = (unsigned char)s[i];
		if (c == '\0')
			break;
		buf[i] = (c == '_') ? '-' : (char)tolower(c);
	}
	buf[i] = '\0';

	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	if (strcmp(buf, "latin-1") == 0 ||
	    strcmp(buf, "iso-8859-1") == 0 ||
	    strcmp(buf, "iso-latin-1") == 0 ||
	    strncmp(buf, "latin-1-", 8) == 0 ||
	    strncmp(buf, "iso-8859-1-", 11) == 0 ||
	    strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	return s;
}
204
/* Return the coding spec in S, or NULL if none is found.

   S is scanned up to SIZE bytes; the text must also be NUL-terminated,
   because the encoding name is read until the first character that
   cannot be part of a name.  The result is a newly allocated string
   (normalized by get_normal_name) that the caller must release with
   PyMem_DEL.  NULL is also returned when memory runs out. */

static char *
get_coding_spec(const char *s, int size)
{
	int i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char *t = s + i;
		const char *begin;
		char *r;
		char *q;

		if (strncmp(t, "coding", 6) != 0)
			continue;
		t += 6;
		if (t[0] != ':' && t[0] != '=')
			continue;
		do {
			t++;
		} while (t[0] == '\x20' || t[0] == '\t');

		begin = t;
		/* Py_CHARMASK keeps the argument nonnegative; isalnum()
		   is undefined for negative values other than EOF. */
		while (isalnum(Py_CHARMASK(t[0])) ||
		       t[0] == '-' || t[0] == '_' || t[0] == '.')
			t++;

		if (begin == t)
			continue;	/* "coding:" with no name after it */

		r = new_string(begin, (int)(t - begin));
		if (r == NULL)
			return NULL;	/* out of memory: do not dereference */
		q = get_normal_name(r);
		if (r != q) {
			/* A canonical constant was returned; hand back a
			   heap copy so the caller can always free it. */
			PyMem_DEL(r);
			r = new_string(q, (int)strlen(q));
		}
		return r;
	}
	return NULL;
}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
255check_coding_spec(const char* line, int size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000279#else
280 /* Without Unicode support, we cannot
281 process the coding spec. Since there
282 won't be any Unicode literals, that
283 won't matter. */
284#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
286 } else { /* then, compare cs with BOM */
287 r = (strcmp(tok->encoding, cs) == 0);
288 PyMem_DEL(cs);
289 }
290 }
291 return r;
292}
293
294/* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
297
298static int
299check_bom(int get_char(struct tok_state *),
300 void unget_char(int, struct tok_state *),
301 int set_readline(struct tok_state *, const char *),
302 struct tok_state *tok)
303{
304 int ch = get_char(tok);
305 tok->decoding_state = 1;
306 if (ch == EOF) {
307 return 1;
308 } else if (ch == 0xEF) {
309 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
310 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
311#if 0
312 /* Disable support for UTF-16 BOMs until a decision
313 is made whether this needs to be supported. */
314 } else if (ch == 0xFE) {
315 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
316 if (!set_readline(tok, "utf-16-be")) return 0;
317 tok->decoding_state = -1;
318 } else if (ch == 0xFF) {
319 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
320 if (!set_readline(tok, "utf-16-le")) return 0;
321 tok->decoding_state = -1;
322#endif
323 } else {
324 unget_char(ch, tok);
325 return 1;
326 }
327 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
328 return 1;
329 NON_BOM:
330 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
331 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
332 return 1;
333}
334
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   The line comes from tok->decoding_buffer if one is pending, otherwise
   from calling tok->decoding_readline(size-1).  It is re-encoded as
   UTF-8 before being copied into S, which has room for SIZE bytes
   including the terminating NUL.  Re-encoding can make the text longer
   than what was read, so when the UTF-8 form does not fit, only whole
   characters that fit are copied and the remainder is parked in
   tok->decoding_buffer for the next call.  An empty line means EOF and
   also yields NULL. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8;
	PyObject* buf = tok->decoding_buffer;
	const char *str;
	size_t len;
	size_t room;

	if (size < 1)
		/* No space even for the terminator. */
		return error_ret(tok);
	room = (size_t)size - 1;

	if (buf == NULL) {
		/* Ask for one less byte so we can terminate it */
		PyObject *args = Py_BuildValue("(i)", size-1);
		if (args == NULL)
			return error_ret(tok);
		buf = PyObject_Call(tok->decoding_readline, args, NULL);
		Py_DECREF(args);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		tok->decoding_buffer = NULL;
	}
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	if (utf8 == NULL)
		return error_ret(tok);

	str = PyString_AsString(utf8);
	len = strlen(str);
	if (len > room) {
		PyObject *rest;
		size_t keep = room;

		/* Never cut inside a multi-byte sequence: back up over
		   UTF-8 continuation bytes (10xxxxxx). */
		while (keep > 0 && (str[keep] & 0xC0) == 0x80)
			keep--;
		if (keep == 0) {
			/* Not even one character fits. */
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		/* decoding_buffer is NULL at this point: it was either
		   NULL on entry or has just been consumed above. */
		rest = PyUnicode_DecodeUTF8(str + keep,
					    (int)(len - keep), NULL);
		if (rest == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		tok->decoding_buffer = rest;
		len = keep;
	}
	memcpy(s, str, len);
	s[len] = '\0';
	Py_DECREF(utf8);
	if (s[0] == '\0') return NULL; /* EOF */
	return s;
#endif
}
374
375/* Set the readline function for TOK to a StreamReader's
376 readline function. The StreamReader is named ENC.
377
378 This function is called from check_bom and check_coding_spec.
379
380 ENC is usually identical to the future value of tok->encoding,
381 except for the (currently unsupported) case of UTF-16.
382
383 Return 1 on success, 0 on failure. */
384
385static int
386fp_setreadl(struct tok_state *tok, const char* enc)
387{
388 PyObject *reader, *stream, *readline;
389
Martin v. Löwis95292d62002-12-11 14:04:59 +0000390 /* XXX: constify filename argument. */
391 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000392 if (stream == NULL)
393 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000394
395 reader = PyCodec_StreamReader(enc, stream, NULL);
396 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000397 if (reader == NULL)
398 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399
400 readline = PyObject_GetAttrString(reader, "readline");
401 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000402 if (readline == NULL)
403 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000404
405 tok->decoding_readline = readline;
406 return 1;
407}
408
409/* Fetch the next byte from TOK. */
410
411static int fp_getc(struct tok_state *tok) {
412 return getc(tok->fp);
413}
414
415/* Unfetch the last byte back into TOK. */
416
417static void fp_ungetc(int c, struct tok_state *tok) {
418 ungetc(c, tok->fp);
419}
420
421/* Read a line of input from TOK. Determine encoding
422 if necessary. */
423
424static char *
425decoding_fgets(char *s, int size, struct tok_state *tok)
426{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000427 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430 if (tok->decoding_state < 0) {
431 /* We already have a codec associated with
432 this input. */
433 line = fp_readl(s, size, tok);
434 break;
435 } else if (tok->decoding_state > 0) {
436 /* We want a 'raw' read. */
437 line = Py_UniversalNewlineFgets(s, size,
438 tok->fp, NULL);
439 warn = 1;
440 break;
441 } else {
442 /* We have not yet determined the encoding.
443 If an encoding is found, use the file-pointer
444 reader functions from now on. */
445 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
446 return error_ret(tok);
447 assert(tok->decoding_state != 0);
448 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000449 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000450 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
451 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
452 return error_ret(tok);
453 }
454 }
455#ifndef PGEN
456 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
457 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000458 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459 if (*c > 127) {
460 badchar = *c;
461 break;
462 }
463 }
464 if (badchar) {
465 char buf[200];
466 sprintf(buf, "Non-ASCII character '\\x%.2x', "
467 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000468 /* Need to add 1 to the line number, since this line
469 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000471 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472 NULL, NULL);
473 tok->issued_encoding_warning = 1;
474 }
475#endif
476 return line;
477}
478
479static int
480decoding_feof(struct tok_state *tok)
481{
482 if (tok->decoding_state >= 0) {
483 return feof(tok->fp);
484 } else {
485 PyObject* buf = tok->decoding_buffer;
486 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000487 PyObject *args = PyTuple_New(0);
488 if (args == NULL) {
489 error_ret(tok);
490 return 1;
491 }
492 buf = PyObject_Call(tok->decoding_readline,
493 args, NULL);
494 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495 if (buf == NULL) {
496 error_ret(tok);
497 return 1;
498 } else {
499 tok->decoding_buffer = buf;
500 }
501 }
502 return PyObject_Length(buf) == 0;
503 }
504}
505
506/* Fetch a byte from TOK, using the string buffer. */
507
508static int buf_getc(struct tok_state *tok) {
509 return *tok->str++;
510}
511
512/* Unfetch a byte from TOK, using the string buffer. */
513
514static void buf_ungetc(int c, struct tok_state *tok) {
515 tok->str--;
516 assert(*tok->str == c); /* tok->cur may point to read-only segment */
517}
518
519/* Set the readline function for TOK to ENC. For the string-based
520 tokenizer, this means to just record the encoding. */
521
522static int buf_setreadl(struct tok_state *tok, const char* enc) {
523 tok->enc = enc;
524 return 1;
525}
526
/* Decode the NUL-terminated C string STR from encoding ENC and return
   it re-encoded as a UTF-8 Python string object.  Returns NULL when
   either step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *encoded;
	PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

	if (!decoded)
		return NULL;
	encoded = PyUnicode_AsUTF8String(decoded);
	/* The intermediate Unicode object is no longer needed,
	   whether or not the encode step worked. */
	Py_DECREF(decoded);
	return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542
543/* Decode a byte string STR for use as the buffer of TOK.
544 Look for encoding declarations inside STR, and record them
545 inside TOK. */
546
547static const char *
548decode_str(const char *str, struct tok_state *tok)
549{
550 PyObject* utf8 = NULL;
551 const char *s;
552 int lineno = 0;
553 tok->enc = NULL;
554 tok->str = str;
555 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
556 return NULL;
557 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000558 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000559#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 if (tok->enc != NULL) {
561 utf8 = translate_into_utf8(str, tok->enc);
562 if (utf8 == NULL)
563 return NULL;
564 str = PyString_AsString(utf8);
565 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000566#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567 for (s = str;; s++) {
568 if (*s == '\0') break;
569 else if (*s == '\n') {
570 lineno++;
571 if (lineno == 2) break;
572 }
573 }
574 tok->enc = NULL;
575 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
576 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000577#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 if (tok->enc != NULL) {
579 assert(utf8 == NULL);
580 utf8 = translate_into_utf8(str, tok->enc);
581 if (utf8 == NULL)
582 return NULL;
583 str = PyString_AsString(utf8);
584 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000585#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586 assert(tok->decoding_buffer == NULL);
587 tok->decoding_buffer = utf8; /* CAUTION */
588 return str;
589}
590
591#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000592
593/* Set up tokenizer for string */
594
595struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000596PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000597{
598 struct tok_state *tok = tok_new();
599 if (tok == NULL)
600 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 str = (char *)decode_str(str, tok);
602 if (str == NULL)
603 return NULL;
Martin v. Löwis95292d62002-12-11 14:04:59 +0000604 /* XXX: constify members. */
605 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606 return tok;
607}
608
609
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000610/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000611
612struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000613PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000614{
615 struct tok_state *tok = tok_new();
616 if (tok == NULL)
617 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000618 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
619 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000620 return NULL;
621 }
622 tok->cur = tok->inp = tok->buf;
623 tok->end = tok->buf + BUFSIZ;
624 tok->fp = fp;
625 tok->prompt = ps1;
626 tok->nextprompt = ps2;
627 return tok;
628}
629
630
631/* Free a tok_state structure */
632
633void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000634PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000635{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 if (tok->encoding != NULL)
637 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000638#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 Py_XDECREF(tok->decoding_readline);
640 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000641#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000643 PyMem_DEL(tok->buf);
644 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645}
646
647
648/* Get next char, updating state; error code goes into tok->done */
649
650static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000651tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000654 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000655 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000656 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000657 if (tok->done != E_OK)
658 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000660 char *end = strchr(tok->inp, '\n');
661 if (end != NULL)
662 end++;
663 else {
664 end = strchr(tok->inp, '\0');
665 if (end == tok->inp) {
666 tok->done = E_EOF;
667 return EOF;
668 }
669 }
670 if (tok->start == NULL)
671 tok->buf = tok->cur;
672 tok->lineno++;
673 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000674 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000676 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000677 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678 if (tok->nextprompt != NULL)
679 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000680 if (new == NULL)
681 tok->done = E_INTR;
682 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000683 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 tok->done = E_EOF;
685 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000686 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000687 size_t start = tok->start - tok->buf;
688 size_t oldlen = tok->cur - tok->buf;
689 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000690 char *buf = tok->buf;
691 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000692 tok->lineno++;
693 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000694 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000695 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000696 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000697 tok->done = E_NOMEM;
698 return EOF;
699 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000700 tok->buf = buf;
701 tok->cur = tok->buf + oldlen;
702 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000703 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000704 tok->inp = tok->buf + newlen;
705 tok->end = tok->inp + 1;
706 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000707 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000708 else {
709 tok->lineno++;
710 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000711 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000712 tok->buf = new;
713 tok->cur = tok->buf;
714 tok->inp = strchr(tok->buf, '\0');
715 tok->end = tok->inp + 1;
716 }
717 }
718 else {
719 int done = 0;
720 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000721 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000722 if (tok->start == NULL) {
723 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000724 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000725 if (tok->buf == NULL) {
726 tok->done = E_NOMEM;
727 return EOF;
728 }
729 tok->end = tok->buf + BUFSIZ;
730 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000731 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
732 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000733 tok->done = E_EOF;
734 done = 1;
735 }
736 else {
737 tok->done = E_OK;
738 tok->inp = strchr(tok->buf, '\0');
739 done = tok->inp[-1] == '\n';
740 }
741 }
742 else {
743 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000744 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000745 tok->done = E_EOF;
746 done = 1;
747 }
748 else
749 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000750 }
751 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000752 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000753 while (!done) {
754 int curstart = tok->start == NULL ? -1 :
755 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000756 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000757 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000758 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000759 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000760 if (newbuf == NULL) {
761 tok->done = E_NOMEM;
762 tok->cur = tok->inp;
763 return EOF;
764 }
765 tok->buf = newbuf;
766 tok->inp = tok->buf + curvalid;
767 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000768 tok->start = curstart < 0 ? NULL :
769 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000770 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000771 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000772 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000773 /* Last line does not end in \n,
774 fake one */
775 strcpy(tok->inp, "\n");
776 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000777 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000778 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000779 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000780 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000781#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000782 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000783 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000784 pt = tok->inp - 2;
785 if (pt >= tok->buf && *pt == '\r') {
786 *pt++ = '\n';
787 *pt = '\0';
788 tok->inp = pt;
789 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000790#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 }
792 if (tok->done != E_OK) {
793 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000794 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000795 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000796 return EOF;
797 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000799 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800}
801
802
803/* Back-up one character */
804
805static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000806tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807{
808 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000809 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000810 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 if (*tok->cur != c)
812 *tok->cur = c;
813 }
814}
815
816
817/* Return the token corresponding to a single character */
818
819int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000820PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821{
822 switch (c) {
823 case '(': return LPAR;
824 case ')': return RPAR;
825 case '[': return LSQB;
826 case ']': return RSQB;
827 case ':': return COLON;
828 case ',': return COMMA;
829 case ';': return SEMI;
830 case '+': return PLUS;
831 case '-': return MINUS;
832 case '*': return STAR;
833 case '/': return SLASH;
834 case '|': return VBAR;
835 case '&': return AMPER;
836 case '<': return LESS;
837 case '>': return GREATER;
838 case '=': return EQUAL;
839 case '.': return DOT;
840 case '%': return PERCENT;
841 case '`': return BACKQUOTE;
842 case '{': return LBRACE;
843 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000844 case '^': return CIRCUMFLEX;
845 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846 default: return OP;
847 }
848}
849
850
Guido van Rossumfbab9051991-10-20 20:25:03 +0000851int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000852PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000853{
854 switch (c1) {
855 case '=':
856 switch (c2) {
857 case '=': return EQEQUAL;
858 }
859 break;
860 case '!':
861 switch (c2) {
862 case '=': return NOTEQUAL;
863 }
864 break;
865 case '<':
866 switch (c2) {
867 case '>': return NOTEQUAL;
868 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000869 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000870 }
871 break;
872 case '>':
873 switch (c2) {
874 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000875 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000876 }
877 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000878 case '+':
879 switch (c2) {
880 case '=': return PLUSEQUAL;
881 }
882 break;
883 case '-':
884 switch (c2) {
885 case '=': return MINEQUAL;
886 }
887 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000888 case '*':
889 switch (c2) {
890 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000891 case '=': return STAREQUAL;
892 }
893 break;
894 case '/':
895 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000896 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000897 case '=': return SLASHEQUAL;
898 }
899 break;
900 case '|':
901 switch (c2) {
902 case '=': return VBAREQUAL;
903 }
904 break;
905 case '%':
906 switch (c2) {
907 case '=': return PERCENTEQUAL;
908 }
909 break;
910 case '&':
911 switch (c2) {
912 case '=': return AMPEREQUAL;
913 }
914 break;
915 case '^':
916 switch (c2) {
917 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000918 }
919 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000920 }
921 return OP;
922}
923
Thomas Wouters434d0822000-08-24 20:11:32 +0000924int
925PyToken_ThreeChars(int c1, int c2, int c3)
926{
927 switch (c1) {
928 case '<':
929 switch (c2) {
930 case '<':
931 switch (c3) {
932 case '=':
933 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000934 }
935 break;
936 }
937 break;
938 case '>':
939 switch (c2) {
940 case '>':
941 switch (c3) {
942 case '=':
943 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000944 }
945 break;
946 }
947 break;
948 case '*':
949 switch (c2) {
950 case '*':
951 switch (c3) {
952 case '=':
953 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000954 }
955 break;
956 }
957 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000958 case '/':
959 switch (c2) {
960 case '/':
961 switch (c3) {
962 case '=':
963 return DOUBLESLASHEQUAL;
964 }
965 break;
966 }
967 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000968 }
969 return OP;
970}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000971
Guido van Rossum926f13a1998-04-09 21:38:06 +0000972static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000973indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000974{
975 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000976 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000977 tok->cur = tok->inp;
978 return 1;
979 }
980 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000981 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
982 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000983 tok->altwarning = 0;
984 }
985 return 0;
986}
987
988
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000989/* Get next token, after space stripping etc. */
990
/* Scan one token from tok and return its type.
 *
 * tok     - tokenizer state; its cursor, indentation stacks, bracket
 *           nesting level and pending INDENT/DEDENT count are updated.
 * p_start - receives the address of the first character of the token text.
 * p_end   - receives the address one past the last character of the text.
 *
 * Both outputs are reset to NULL on entry and are filled in only for
 * tokens that carry text (NAME, NUMBER, STRING, NEWLINE, DOT and the
 * operator tokens).  For NEWLINE the range is empty because the newline
 * character itself is left out.  INDENT, DEDENT, ENDMARKER and ERRORTOKEN
 * leave both outputs NULL.
 *
 * Returns the token type.  Every ERRORTOKEN path other than end of input
 * stores an E_* code in tok->done first; at end of input ENDMARKER is
 * returned when tok->done is E_EOF and ERRORTOKEN otherwise.
 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000991static int
992tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000993{
994 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000995 int blankline;
996
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000997 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000998 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000999 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001000 blankline = 0;
1001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001002 /* Get indentation level */
1003 if (tok->atbol) {
/* col is measured with tok->tabsize and altcol with tok->alttabsize;
   each is compared against its own stack (indstack / altindstack) so
   that inconsistent tab and space use can be reported. */
1004 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001005 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001007 for (;;) {
1008 c = tok_nextc(tok);
1009 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001010 col++, altcol++;
1011 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001012 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001013 altcol = (altcol/tok->alttabsize + 1)
1014 * tok->alttabsize;
1015 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001016 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001017 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001018 else
1019 break;
1020 }
1021 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001022 if (c == '#' || c == '\n') {
1023 /* Lines with only whitespace and/or comments
1024 shouldn't affect the indentation and are
1025 not passed to the parser as NEWLINE tokens,
1026 except *totally* empty lines in interactive
1027 mode, which signal the end of a command group. */
1028 if (col == 0 && c == '\n' && tok->prompt != NULL)
1029 blankline = 0; /* Let it through */
1030 else
1031 blankline = 1; /* Ignore completely */
1032 /* We can't jump back right here since we still
1033 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001034 }
/* Indentation is only examined outside brackets (level == 0) and on
   lines that are not blank or comment-only. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001035 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001036 if (col == tok->indstack[tok->indent]) {
1037 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001038 if (altcol != tok->altindstack[tok->indent]) {
1039 if (indenterror(tok))
1040 return ERRORTOKEN;
1041 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001042 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001043 else if (col > tok->indstack[tok->indent]) {
1044 /* Indent -- always one */
1045 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001046 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001047 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001048 return ERRORTOKEN;
1049 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001050 if (altcol <= tok->altindstack[tok->indent]) {
1051 if (indenterror(tok))
1052 return ERRORTOKEN;
1053 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001054 tok->pendin++;
1055 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001056 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001058 else /* col < tok->indstack[tok->indent] */ {
1059 /* Dedent -- any number, must be consistent */
1060 while (tok->indent > 0 &&
1061 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001062 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001063 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001064 }
1065 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001066 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001067 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001068 return ERRORTOKEN;
1069 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001070 if (altcol != tok->altindstack[tok->indent]) {
1071 if (indenterror(tok))
1072 return ERRORTOKEN;
1073 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074 }
1075 }
1076 }
1077
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001078 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001079
1080 /* Return pending indents/dedents */
/* A negative pendin means DEDENT tokens are owed and a positive one
   means INDENT tokens are owed; each call hands out one and moves the
   count one step towards zero. */
1081 if (tok->pendin != 0) {
1082 if (tok->pendin < 0) {
1083 tok->pendin++;
1084 return DEDENT;
1085 }
1086 else {
1087 tok->pendin--;
1088 return INDENT;
1089 }
1090 }
1091
1092 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001093 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001094 /* Skip spaces */
1095 do {
1096 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001097 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001098
1099 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001100 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101
Guido van Rossumab5ca152000-03-31 00:52:27 +00001102 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001104 static char *tabforms[] = {
1105 "tab-width:", /* Emacs */
1106 ":tabstop=", /* vim, full form */
1107 ":ts=", /* vim, abbreviated form */
1108 "set tabsize=", /* will vi never die? */
1109 /* more templates can be added here to support other editors */
1110 };
1111 char cbuf[80];
1112 char *tp, **cp;
1113 tp = cbuf;
/* Copy the comment text into cbuf, stopping at end of line, at EOF,
   or once only the slot for the terminating NUL is left. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001115 *tp++ = c = tok_nextc(tok);
1116 } while (c != EOF && c != '\n' &&
1117 tp - cbuf + 1 < sizeof(cbuf));
1118 *tp = '\0';
/* Search the copied text for each editor template; a number from 1 to
   40 following a template becomes the new tok->tabsize. */
1119 for (cp = tabforms;
1120 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1121 cp++) {
1122 if ((tp = strstr(cbuf, *cp))) {
1123 int newsize = atoi(tp + strlen(*cp));
1124
1125 if (newsize >= 1 && newsize <= 40) {
1126 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001127 if (Py_VerboseFlag)
1128 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001129 "Tab size set to %d\n",
1130 newsize);
1131 }
1132 }
1133 }
/* Discard whatever part of the comment did not fit in cbuf. */
1134 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136 }
1137
1138 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001139 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001140 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001141 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142
1143 /* Identifier (most frequent token!) */
1144 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001145 /* Process r"", u"" and ur"" */
/* tok->start is not moved here, so when a quote follows, the prefix
   letters stay part of the STRING token scanned at letter_quote. */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001146 switch (c) {
1147 case 'r':
1148 case 'R':
1149 c = tok_nextc(tok);
1150 if (c == '"' || c == '\'')
1151 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001152 break;
1153 case 'u':
1154 case 'U':
1155 c = tok_nextc(tok);
1156 if (c == 'r' || c == 'R')
1157 c = tok_nextc(tok);
1158 if (c == '"' || c == '\'')
1159 goto letter_quote;
1160 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001161 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001162 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001163 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001164 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001166 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001167 *p_end = tok->cur;
1168 return NAME;
1169 }
1170
1171 /* Newline */
1172 if (c == '\n') {
1173 tok->atbol = 1;
/* Blank or comment-only lines, and newlines inside brackets
   (level > 0), produce no NEWLINE token: start over on the next line. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001174 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001175 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001176 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001177 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001178 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 return NEWLINE;
1180 }
1181
Guido van Rossum2d45be11997-04-11 19:16:25 +00001182#ifdef macintosh
1183 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001184 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001185 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001186 tok->done = E_TOKEN;
1187 tok->cur = tok->inp;
1188 return ERRORTOKEN;
1189 }
1190#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001191 /* Period or number starting with period? */
1192 if (c == '.') {
1193 c = tok_nextc(tok);
1194 if (isdigit(c)) {
1195 goto fraction;
1196 }
1197 else {
1198 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001199 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001200 *p_end = tok->cur;
1201 return DOT;
1202 }
1203 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001204
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 /* Number */
1206 if (isdigit(c)) {
1207 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001208 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 c = tok_nextc(tok);
1210 if (c == '.')
1211 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001212#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001213 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001214 goto imaginary;
1215#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216 if (c == 'x' || c == 'X') {
1217 /* Hex */
1218 do {
1219 c = tok_nextc(tok);
1220 } while (isxdigit(c));
1221 }
1222 else {
/* found_decimal is set when a digit 8 or 9 follows the leading zero;
   such a literal is accepted only if it continues as a float or an
   imaginary number, otherwise E_TOKEN is reported. */
Tim Petersd507dab2001-08-30 20:51:59 +00001223 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 /* Octal; c is first char of it */
1225 /* There's no 'isoctdigit' macro, sigh */
1226 while ('0' <= c && c < '8') {
1227 c = tok_nextc(tok);
1228 }
Tim Petersd507dab2001-08-30 20:51:59 +00001229 if (isdigit(c)) {
1230 found_decimal = 1;
1231 do {
1232 c = tok_nextc(tok);
1233 } while (isdigit(c));
1234 }
1235 if (c == '.')
1236 goto fraction;
1237 else if (c == 'e' || c == 'E')
1238 goto exponent;
1239#ifndef WITHOUT_COMPLEX
1240 else if (c == 'j' || c == 'J')
1241 goto imaginary;
1242#endif
1243 else if (found_decimal) {
1244 tok->done = E_TOKEN;
1245 tok_backup(tok, c);
1246 return ERRORTOKEN;
1247 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001249 if (c == 'l' || c == 'L')
1250 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 }
1252 else {
1253 /* Decimal */
1254 do {
1255 c = tok_nextc(tok);
1256 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001257 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001259 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001260 /* Accept floating point numbers. */
/* The fraction, exponent and imaginary labels below are also jump
   targets for the leading-period and leading-zero cases above. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001261 if (c == '.') {
1262 fraction:
1263 /* Fraction */
1264 do {
1265 c = tok_nextc(tok);
1266 } while (isdigit(c));
1267 }
1268 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001269 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001270 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001272 if (c == '+' || c == '-')
1273 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001274 if (!isdigit(c)) {
1275 tok->done = E_TOKEN;
1276 tok_backup(tok, c);
1277 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001278 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001279 do {
1280 c = tok_nextc(tok);
1281 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001283#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001284 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001285 /* Imaginary part */
1286 imaginary:
1287 c = tok_nextc(tok);
1288#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 }
1290 }
1291 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001292 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293 *p_end = tok->cur;
1294 return NUMBER;
1295 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001296
1297 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001298 /* String */
1299 if (c == '\'' || c == '"') {
/* quote2 is the value tok->cur - tok->start will have right after the
   character following the opening quote has been read.  tripcount
   counts consecutive quote characters seen inside a triple-quoted
   string; the string ends when it reaches 3. */
Guido van Rossum35685241998-02-16 15:42:50 +00001300 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001301 int quote = c;
1302 int triple = 0;
1303 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 for (;;) {
1305 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001306 if (c == '\n') {
1307 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001308 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001309 tok_backup(tok, c);
1310 return ERRORTOKEN;
1311 }
1312 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001313 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001314 }
1315 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001316 if (triple)
1317 tok->done = E_EOFS;
1318 else
1319 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001320 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 return ERRORTOKEN;
1322 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001323 else if (c == quote) {
1324 tripcount++;
/* A second quote directly after the opening one: if a third quote
   follows this opens a triple-quoted string, otherwise the third
   character is pushed back and the string is empty. */
Guido van Rossum35685241998-02-16 15:42:50 +00001325 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001326 c = tok_nextc(tok);
1327 if (c == quote) {
1328 triple = 1;
1329 tripcount = 0;
1330 continue;
1331 }
1332 tok_backup(tok, c);
1333 }
1334 if (!triple || tripcount == 3)
1335 break;
1336 }
1337 else if (c == '\\') {
1338 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001340 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001341 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001342 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001343 return ERRORTOKEN;
1344 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001345 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001346 else
1347 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001349 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001350 *p_end = tok->cur;
1351 return STRING;
1352 }
1353
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 /* Line continuation */
1355 if (c == '\\') {
1356 c = tok_nextc(tok);
1357 if (c != '\n') {
1358 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001359 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 return ERRORTOKEN;
1361 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001362 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 goto again; /* Read next line */
1364 }
1365
Guido van Rossumfbab9051991-10-20 20:25:03 +00001366 /* Check for two-character token */
/* A two-character match is extended to three characters when
   PyToken_ThreeChars recognises it; otherwise the third character is
   pushed back.  With no two-character match the second character is
   pushed back and c is handled as a single character below. */
1367 {
1368 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001369 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001370 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001371 int c3 = tok_nextc(tok);
1372 int token3 = PyToken_ThreeChars(c, c2, c3);
1373 if (token3 != OP) {
1374 token = token3;
1375 } else {
1376 tok_backup(tok, c3);
1377 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001378 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001379 *p_end = tok->cur;
1380 return token;
1381 }
1382 tok_backup(tok, c2);
1383 }
1384
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001386 switch (c) {
1387 case '(':
1388 case '[':
1389 case '{':
1390 tok->level++;
1391 break;
1392 case ')':
1393 case ']':
1394 case '}':
1395 tok->level--;
1396 break;
1397 }
1398
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001400 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001402 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403}
1404
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001405int
1406PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1407{
1408 int result = tok_get(tok, p_start, p_end);
1409 if (tok->decoding_erred) {
1410 result = ERRORTOKEN;
1411 tok->done = E_DECODE;
1412 }
1413 return result;
1414}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415
Guido van Rossum408027e1996-12-30 16:17:54 +00001416#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001417
1418void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001419tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420{
Guido van Rossum86bea461997-04-29 21:03:06 +00001421 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1423 printf("(%.*s)", (int)(end - start), start);
1424}
1425
1426#endif