blob: 6957cc95932fdde5928f1b610e6176e203f33f1b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise operators, shifts and power */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* Trailing pseudo-entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Guido van Rossum86bea461997-04-29 21:03:06 +0000108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
130 tok->issued_encoding_warning = 0;
131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* pgen build: strings are used as they are; STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167 PyMem_DEL(tok->buf);
168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
173new_string(const char *s, int len)
174{
175 char* result = PyMem_NEW(char, len + 1);
176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Map the common spellings of the utf-8 and latin-1 encoding names to a
   canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated like '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by a '-' suffix); otherwise returns S itself, unchanged.
   The caller can therefore detect a rewrite by comparing the result
   with S. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a
           non-ASCII byte where char is signed) to tolower() is
           undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
205
/* Return the coding spec in S, or NULL if none is found.

   S is scanned up to SIZE bytes.  The spec has the form
   "coding[:=]<spaces/tabs><name>", where <name> consists of
   alphanumerics, '-', '_' and '.'.  The returned name is a newly
   allocated string (normalized by get_normal_name) that the caller
   must release with PyMem_DEL.  NULL is also returned when memory for
   the name cannot be allocated. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* isalnum() requires a value representable as unsigned char;
           a plain (signed) char holding a non-ASCII byte is not. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;   /* "coding:" without a name; keep looking */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;    /* out of memory: do not touch r below */
        q = get_normal_name(r);
        if (r != q) {
            /* The name was normalized to a string literal; hand
               back an owned copy of it instead. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
256check_coding_spec(const char* line, int size, struct tok_state *tok,
257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000280#else
281 /* Without Unicode support, we cannot
282 process the coding spec. Since there
283 won't be any Unicode literals, that
284 won't matter. */
285#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 } else { /* then, compare cs with BOM */
288 r = (strcmp(tok->encoding, cs) == 0);
289 PyMem_DEL(cs);
290 }
291 }
292 return r;
293}
294
295/* See whether the file starts with a BOM. If it does,
296 invoke the set_readline function with the new encoding.
297 Return 1 on success, 0 on failure. */
298
299static int
300check_bom(int get_char(struct tok_state *),
301 void unget_char(int, struct tok_state *),
302 int set_readline(struct tok_state *, const char *),
303 struct tok_state *tok)
304{
305 int ch = get_char(tok);
306 tok->decoding_state = 1;
307 if (ch == EOF) {
308 return 1;
309 } else if (ch == 0xEF) {
310 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
311 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
312#if 0
313 /* Disable support for UTF-16 BOMs until a decision
314 is made whether this needs to be supported. */
315 } else if (ch == 0xFE) {
316 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
317 if (!set_readline(tok, "utf-16-be")) return 0;
318 tok->decoding_state = -1;
319 } else if (ch == 0xFF) {
320 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
321 if (!set_readline(tok, "utf-16-le")) return 0;
322 tok->decoding_state = -1;
323#endif
324 } else {
325 unget_char(ch, tok);
326 return 1;
327 }
328 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
329 return 1;
330 NON_BOM:
331 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
332 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
333 return 1;
334}
335
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a NUL.
   Return NULL on failure or when nothing was read (EOF), else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: tok->decoding_readline must be called to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: a previous call to fp_readl did not have enough
        room in S for the whole line read by tok->decoding_readline;
        tok->decoding_buffer holds the overflow.  In this case fp_readl
        is called in a loop (with an expanded buffer) until the buffer
        ends with a '\n' or the end of the file is reached: see
        tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    int nbytes;
    int room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the stored object; an exact string is
           left-over UTF-8 from an earlier call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Keep what does not fit for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }

    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
400
401/* Set the readline function for TOK to a StreamReader's
402 readline function. The StreamReader is named ENC.
403
404 This function is called from check_bom and check_coding_spec.
405
406 ENC is usually identical to the future value of tok->encoding,
407 except for the (currently unsupported) case of UTF-16.
408
409 Return 1 on success, 0 on failure. */
410
411static int
412fp_setreadl(struct tok_state *tok, const char* enc)
413{
414 PyObject *reader, *stream, *readline;
415
Martin v. Löwis95292d62002-12-11 14:04:59 +0000416 /* XXX: constify filename argument. */
417 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000418 if (stream == NULL)
419 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000420
421 reader = PyCodec_StreamReader(enc, stream, NULL);
422 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000423 if (reader == NULL)
424 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426 readline = PyObject_GetAttrString(reader, "readline");
427 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 if (readline == NULL)
429 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430
431 tok->decoding_readline = readline;
432 return 1;
433}
434
435/* Fetch the next byte from TOK. */
436
437static int fp_getc(struct tok_state *tok) {
438 return getc(tok->fp);
439}
440
441/* Unfetch the last byte back into TOK. */
442
443static void fp_ungetc(int c, struct tok_state *tok) {
444 ungetc(c, tok->fp);
445}
446
447/* Read a line of input from TOK. Determine encoding
448 if necessary. */
449
450static char *
451decoding_fgets(char *s, int size, struct tok_state *tok)
452{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000453 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000455 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000456 if (tok->decoding_state < 0) {
457 /* We already have a codec associated with
458 this input. */
459 line = fp_readl(s, size, tok);
460 break;
461 } else if (tok->decoding_state > 0) {
462 /* We want a 'raw' read. */
463 line = Py_UniversalNewlineFgets(s, size,
464 tok->fp, NULL);
465 warn = 1;
466 break;
467 } else {
468 /* We have not yet determined the encoding.
469 If an encoding is found, use the file-pointer
470 reader functions from now on. */
471 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
472 return error_ret(tok);
473 assert(tok->decoding_state != 0);
474 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000475 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
477 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
478 return error_ret(tok);
479 }
480 }
481#ifndef PGEN
482 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
483 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000484 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485 if (*c > 127) {
486 badchar = *c;
487 break;
488 }
489 }
490 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000491 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000492 /* Need to add 1 to the line number, since this line
493 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000494 sprintf(buf,
495 "Non-ASCII character '\\x%.2x' "
496 "in file %.200s on line %i, "
497 "but no encoding declared; "
498 "see http://www.python.org/peps/pep-0263.html for details",
499 badchar, tok->filename, tok->lineno + 1);
500 /* We don't use PyErr_WarnExplicit() here because
501 printing the line in question to e.g. a log file
502 could result in sensitive information being
503 exposed. */
504 PyErr_Warn(PyExc_DeprecationWarning, buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505 tok->issued_encoding_warning = 1;
506 }
507#endif
508 return line;
509}
510
511static int
512decoding_feof(struct tok_state *tok)
513{
514 if (tok->decoding_state >= 0) {
515 return feof(tok->fp);
516 } else {
517 PyObject* buf = tok->decoding_buffer;
518 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000519 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000520 if (buf == NULL) {
521 error_ret(tok);
522 return 1;
523 } else {
524 tok->decoding_buffer = buf;
525 }
526 }
527 return PyObject_Length(buf) == 0;
528 }
529}
530
531/* Fetch a byte from TOK, using the string buffer. */
532
533static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000534 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535}
536
537/* Unfetch a byte from TOK, using the string buffer. */
538
539static void buf_ungetc(int c, struct tok_state *tok) {
540 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000541 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542}
543
544/* Set the readline function for TOK to ENC. For the string-based
545 tokenizer, this means to just record the encoding. */
546
547static int buf_setreadl(struct tok_state *tok, const char* enc) {
548 tok->enc = enc;
549 return 1;
550}
551
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567
568/* Decode a byte string STR for use as the buffer of TOK.
569 Look for encoding declarations inside STR, and record them
570 inside TOK. */
571
572static const char *
573decode_str(const char *str, struct tok_state *tok)
574{
575 PyObject* utf8 = NULL;
576 const char *s;
577 int lineno = 0;
578 tok->enc = NULL;
579 tok->str = str;
580 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
581 return NULL;
582 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000583 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000584#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 if (tok->enc != NULL) {
586 utf8 = translate_into_utf8(str, tok->enc);
587 if (utf8 == NULL)
588 return NULL;
589 str = PyString_AsString(utf8);
590 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000591#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 for (s = str;; s++) {
593 if (*s == '\0') break;
594 else if (*s == '\n') {
595 lineno++;
596 if (lineno == 2) break;
597 }
598 }
599 tok->enc = NULL;
600 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
601 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000602#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 if (tok->enc != NULL) {
604 assert(utf8 == NULL);
605 utf8 = translate_into_utf8(str, tok->enc);
606 if (utf8 == NULL)
607 return NULL;
608 str = PyString_AsString(utf8);
609 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000610#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611 assert(tok->decoding_buffer == NULL);
612 tok->decoding_buffer = utf8; /* CAUTION */
613 return str;
614}
615
616#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617
618/* Set up tokenizer for string */
619
620struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000621PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000622{
623 struct tok_state *tok = tok_new();
624 if (tok == NULL)
625 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626 str = (char *)decode_str(str, tok);
627 if (str == NULL)
628 return NULL;
Martin v. Löwis95292d62002-12-11 14:04:59 +0000629 /* XXX: constify members. */
630 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 return tok;
632}
633
634
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000635/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636
637struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000638PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000639{
640 struct tok_state *tok = tok_new();
641 if (tok == NULL)
642 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000643 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
644 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645 return NULL;
646 }
647 tok->cur = tok->inp = tok->buf;
648 tok->end = tok->buf + BUFSIZ;
649 tok->fp = fp;
650 tok->prompt = ps1;
651 tok->nextprompt = ps2;
652 return tok;
653}
654
655
656/* Free a tok_state structure */
657
658void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000659PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 if (tok->encoding != NULL)
662 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000663#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664 Py_XDECREF(tok->decoding_readline);
665 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000666#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000668 PyMem_DEL(tok->buf);
669 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000670}
671
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000672#if !defined(PGEN) && defined(Py_USING_UNICODE)
673static int
674tok_stdin_decode(struct tok_state *tok, char **inp)
675{
676 PyObject *enc, *sysstdin, *decoded, *utf8;
677 const char *encoding;
678 char *converted;
679
680 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
681 return 0;
682 sysstdin = PySys_GetObject("stdin");
683 if (sysstdin == NULL || !PyFile_Check(sysstdin))
684 return 0;
685
686 enc = ((PyFileObject *)sysstdin)->f_encoding;
687 if (enc == NULL || !PyString_Check(enc))
688 return 0;
689 Py_INCREF(enc);
690
691 encoding = PyString_AsString(enc);
692 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
693 if (decoded == NULL)
694 goto error_clear;
695
696 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
697 Py_DECREF(decoded);
698 if (utf8 == NULL)
699 goto error_clear;
700
701 converted = new_string(PyString_AsString(utf8), PyString_Size(utf8));
702 Py_DECREF(utf8);
703 if (converted == NULL)
704 goto error_nomem;
705
706 PyMem_FREE(*inp);
707 *inp = converted;
708 if (tok->encoding != NULL)
709 PyMem_DEL(tok->encoding);
710 tok->encoding = new_string(encoding, strlen(encoding));
711 if (tok->encoding == NULL)
712 goto error_nomem;
713
714 Py_DECREF(enc);
715 return 0;
716
717error_nomem:
718 Py_DECREF(enc);
719 tok->done = E_NOMEM;
720 return -1;
721
722error_clear:
723 /* Fallback to iso-8859-1: for backward compatibility */
724 Py_DECREF(enc);
725 PyErr_Clear();
726 return 0;
727}
728#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729
730/* Get next char, updating state; error code goes into tok->done */
731
732static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000733tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000734{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000736 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000737 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000738 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000739 if (tok->done != E_OK)
740 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000742 char *end = strchr(tok->inp, '\n');
743 if (end != NULL)
744 end++;
745 else {
746 end = strchr(tok->inp, '\0');
747 if (end == tok->inp) {
748 tok->done = E_EOF;
749 return EOF;
750 }
751 }
752 if (tok->start == NULL)
753 tok->buf = tok->cur;
754 tok->lineno++;
755 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000756 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000759 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 if (tok->nextprompt != NULL)
761 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000762 if (new == NULL)
763 tok->done = E_INTR;
764 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000765 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766 tok->done = E_EOF;
767 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000768#if !defined(PGEN) && defined(Py_USING_UNICODE)
769 else if (tok_stdin_decode(tok, &new) != 0)
770 PyMem_FREE(new);
771#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000772 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000773 size_t start = tok->start - tok->buf;
774 size_t oldlen = tok->cur - tok->buf;
775 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000776 char *buf = tok->buf;
777 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000778 tok->lineno++;
779 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000780 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000781 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000782 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000783 tok->done = E_NOMEM;
784 return EOF;
785 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000786 tok->buf = buf;
787 tok->cur = tok->buf + oldlen;
788 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000789 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000790 tok->inp = tok->buf + newlen;
791 tok->end = tok->inp + 1;
792 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000793 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 else {
795 tok->lineno++;
796 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000797 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000798 tok->buf = new;
799 tok->cur = tok->buf;
800 tok->inp = strchr(tok->buf, '\0');
801 tok->end = tok->inp + 1;
802 }
803 }
804 else {
805 int done = 0;
806 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000807 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 if (tok->start == NULL) {
809 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000810 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 if (tok->buf == NULL) {
812 tok->done = E_NOMEM;
813 return EOF;
814 }
815 tok->end = tok->buf + BUFSIZ;
816 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000817 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
818 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->done = E_EOF;
820 done = 1;
821 }
822 else {
823 tok->done = E_OK;
824 tok->inp = strchr(tok->buf, '\0');
825 done = tok->inp[-1] == '\n';
826 }
827 }
828 else {
829 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000830 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000831 tok->done = E_EOF;
832 done = 1;
833 }
834 else
835 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 }
837 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000838 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 while (!done) {
840 int curstart = tok->start == NULL ? -1 :
841 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000842 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000843 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000844 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000845 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000846 if (newbuf == NULL) {
847 tok->done = E_NOMEM;
848 tok->cur = tok->inp;
849 return EOF;
850 }
851 tok->buf = newbuf;
852 tok->inp = tok->buf + curvalid;
853 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 tok->start = curstart < 0 ? NULL :
855 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000856 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000857 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000858 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000859 /* Last line does not end in \n,
860 fake one */
861 strcpy(tok->inp, "\n");
862 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000866 tok->cur = tok->buf + cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000867 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000868 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000869 pt = tok->inp - 2;
870 if (pt >= tok->buf && *pt == '\r') {
871 *pt++ = '\n';
872 *pt = '\0';
873 tok->inp = pt;
874 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875 }
876 if (tok->done != E_OK) {
877 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000878 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000879 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000880 return EOF;
881 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000883 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000884}
885
886
887/* Back-up one character */
888
889static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000890tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000891{
892 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000893 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000894 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000895 if (*tok->cur != c)
896 *tok->cur = c;
897 }
898}
899
900
901/* Return the token corresponding to a single character */
902
903int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000904PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905{
906 switch (c) {
907 case '(': return LPAR;
908 case ')': return RPAR;
909 case '[': return LSQB;
910 case ']': return RSQB;
911 case ':': return COLON;
912 case ',': return COMMA;
913 case ';': return SEMI;
914 case '+': return PLUS;
915 case '-': return MINUS;
916 case '*': return STAR;
917 case '/': return SLASH;
918 case '|': return VBAR;
919 case '&': return AMPER;
920 case '<': return LESS;
921 case '>': return GREATER;
922 case '=': return EQUAL;
923 case '.': return DOT;
924 case '%': return PERCENT;
925 case '`': return BACKQUOTE;
926 case '{': return LBRACE;
927 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000928 case '^': return CIRCUMFLEX;
929 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000930 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000931 default: return OP;
932 }
933}
934
935
Guido van Rossumfbab9051991-10-20 20:25:03 +0000936int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000937PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000938{
939 switch (c1) {
940 case '=':
941 switch (c2) {
942 case '=': return EQEQUAL;
943 }
944 break;
945 case '!':
946 switch (c2) {
947 case '=': return NOTEQUAL;
948 }
949 break;
950 case '<':
951 switch (c2) {
952 case '>': return NOTEQUAL;
953 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000954 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000955 }
956 break;
957 case '>':
958 switch (c2) {
959 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000960 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000961 }
962 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000963 case '+':
964 switch (c2) {
965 case '=': return PLUSEQUAL;
966 }
967 break;
968 case '-':
969 switch (c2) {
970 case '=': return MINEQUAL;
971 }
972 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000973 case '*':
974 switch (c2) {
975 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000976 case '=': return STAREQUAL;
977 }
978 break;
979 case '/':
980 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000981 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000982 case '=': return SLASHEQUAL;
983 }
984 break;
985 case '|':
986 switch (c2) {
987 case '=': return VBAREQUAL;
988 }
989 break;
990 case '%':
991 switch (c2) {
992 case '=': return PERCENTEQUAL;
993 }
994 break;
995 case '&':
996 switch (c2) {
997 case '=': return AMPEREQUAL;
998 }
999 break;
1000 case '^':
1001 switch (c2) {
1002 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001003 }
1004 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001005 }
1006 return OP;
1007}
1008
Thomas Wouters434d0822000-08-24 20:11:32 +00001009int
1010PyToken_ThreeChars(int c1, int c2, int c3)
1011{
1012 switch (c1) {
1013 case '<':
1014 switch (c2) {
1015 case '<':
1016 switch (c3) {
1017 case '=':
1018 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 }
1020 break;
1021 }
1022 break;
1023 case '>':
1024 switch (c2) {
1025 case '>':
1026 switch (c3) {
1027 case '=':
1028 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001029 }
1030 break;
1031 }
1032 break;
1033 case '*':
1034 switch (c2) {
1035 case '*':
1036 switch (c3) {
1037 case '=':
1038 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001039 }
1040 break;
1041 }
1042 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001043 case '/':
1044 switch (c2) {
1045 case '/':
1046 switch (c3) {
1047 case '=':
1048 return DOUBLESLASHEQUAL;
1049 }
1050 break;
1051 }
1052 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001053 }
1054 return OP;
1055}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001056
Guido van Rossum926f13a1998-04-09 21:38:06 +00001057static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001058indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001059{
1060 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001061 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001062 tok->cur = tok->inp;
1063 return 1;
1064 }
1065 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001066 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1067 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001068 tok->altwarning = 0;
1069 }
1070 return 0;
1071}
1072
1073
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074/* Get next token, after space stripping etc.

   Returns the token type.  *p_start and *p_end are reset to NULL on
   entry and are only filled in (delimiting the token text, taken from
   tok->start and tok->cur) on the paths that return NAME, NUMBER,
   STRING, NEWLINE, DOT or an operator token; the INDENT, DEDENT,
   end-of-input and ERRORTOKEN returns leave them NULL.

   Errors detected here store an E_* code in tok->done before
   returning ERRORTOKEN.  When tok_nextc() reports EOF outside a string
   the result is ENDMARKER if tok->done is E_EOF and ERRORTOKEN
   otherwise.

   As a side effect this maintains tok->atbol, the indentation stacks
   together with tok->indent and tok->pendin, the bracket nesting
   counter tok->level, tok->cont_line and tok->tabsize. */
1075
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001076static int
1077tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001078{
1079 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001080 int blankline;
1081
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001082 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001083 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001084 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001085 blankline = 0;
1086
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087 /* Get indentation level */
1088 if (tok->atbol) {
 /* col is computed with tok->tabsize and altcol with tok->alttabsize;
    the two are compared against their stacks below to detect
    inconsistent use of tabs and spaces */
1089 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001090 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001092 for (;;) {
1093 c = tok_nextc(tok);
1094 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095 col++, altcol++;
1096 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001098 altcol = (altcol/tok->alttabsize + 1)
1099 * tok->alttabsize;
1100 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001101 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001102 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103 else
1104 break;
1105 }
 /* Un-read the first character that is not indentation */
1106 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001107 if (c == '#' || c == '\n') {
1108 /* Lines with only whitespace and/or comments
1109 shouldn't affect the indentation and are
1110 not passed to the parser as NEWLINE tokens,
1111 except *totally* empty lines in interactive
1112 mode, which signal the end of a command group. */
1113 if (col == 0 && c == '\n' && tok->prompt != NULL)
1114 blankline = 0; /* Let it through */
1115 else
1116 blankline = 1; /* Ignore completely */
1117 /* We can't jump back right here since we still
1118 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119 }
 /* Indentation is only significant outside brackets (level == 0) */
Guido van Rossuma849b831993-05-12 11:35:44 +00001120 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121 if (col == tok->indstack[tok->indent]) {
1122 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 if (altcol != tok->altindstack[tok->indent]) {
1124 if (indenterror(tok))
1125 return ERRORTOKEN;
1126 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001127 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001128 else if (col > tok->indstack[tok->indent]) {
1129 /* Indent -- always one */
1130 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001131 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001132 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001133 return ERRORTOKEN;
1134 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001135 if (altcol <= tok->altindstack[tok->indent]) {
1136 if (indenterror(tok))
1137 return ERRORTOKEN;
1138 }
 /* Queue one INDENT; it is handed out by the pendin check below */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001139 tok->pendin++;
1140 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001141 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 else /* col < tok->indstack[tok->indent] */ {
1144 /* Dedent -- any number, must be consistent */
1145 while (tok->indent > 0 &&
1146 col < tok->indstack[tok->indent]) {
 /* One DEDENT is queued (pendin goes down by one) per popped level */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001147 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001148 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001149 }
1150 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001151 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001152 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 return ERRORTOKEN;
1154 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155 if (altcol != tok->altindstack[tok->indent]) {
1156 if (indenterror(tok))
1157 return ERRORTOKEN;
1158 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 }
1160 }
1161 }
1162
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001163 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164
1165 /* Return pending indents/dedents */
 /* Only one INDENT or DEDENT is returned per call; the remainder stays
    counted in tok->pendin for the following calls */
1166 if (tok->pendin != 0) {
1167 if (tok->pendin < 0) {
1168 tok->pendin++;
1169 return DEDENT;
1170 }
1171 else {
1172 tok->pendin--;
1173 return INDENT;
1174 }
1175 }
1176
1177 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001178 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 /* Skip spaces */
1180 do {
1181 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001182 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001183
1184 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001185 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001186
Guido van Rossumab5ca152000-03-31 00:52:27 +00001187 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001188 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001189 static char *tabforms[] = {
1190 "tab-width:", /* Emacs */
1191 ":tabstop=", /* vim, full form */
1192 ":ts=", /* vim, abbreviated form */
1193 "set tabsize=", /* will vi never die? */
1194 /* more templates can be added here to support other editors */
1195 };
1196 char cbuf[80];
1197 char *tp, **cp;
1198 tp = cbuf;
 /* Copy the comment text into cbuf, stopping at end of line, EOF,
    or when only room for the terminating NUL is left */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001200 *tp++ = c = tok_nextc(tok);
1201 } while (c != EOF && c != '\n' &&
1202 tp - cbuf + 1 < sizeof(cbuf));
1203 *tp = '\0';
 /* Look for each editor directive in the copied text; a numeric
    value from 1 to 40 following it becomes the new tok->tabsize,
    anything else is ignored */
1204 for (cp = tabforms;
1205 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1206 cp++) {
1207 if ((tp = strstr(cbuf, *cp))) {
1208 int newsize = atoi(tp + strlen(*cp));
1209
1210 if (newsize >= 1 && newsize <= 40) {
1211 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001212 if (Py_VerboseFlag)
1213 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001214 "Tab size set to %d\n",
1215 newsize);
1216 }
1217 }
1218 }
 /* Skip whatever part of the comment did not fit in cbuf */
1219 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 }
1222
1223 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001224 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001225 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001226 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227
1228 /* Identifier (most frequent token!) */
1229 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001230 /* Process r"", u"" and ur"" */
 /* The prefix letters are consumed here; when no quote follows, the
    characters already read are simply the start of an ordinary
    identifier and scanning continues in the loop below */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001231 switch (c) {
1232 case 'r':
1233 case 'R':
1234 c = tok_nextc(tok);
1235 if (c == '"' || c == '\'')
1236 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001237 break;
1238 case 'u':
1239 case 'U':
1240 c = tok_nextc(tok);
1241 if (c == 'r' || c == 'R')
1242 c = tok_nextc(tok);
1243 if (c == '"' || c == '\'')
1244 goto letter_quote;
1245 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001246 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001247 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001249 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001251 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001252 *p_end = tok->cur;
1253 return NAME;
1254 }
1255
1256 /* Newline */
1257 if (c == '\n') {
1258 tok->atbol = 1;
 /* No NEWLINE token for ignored (blank/comment-only) lines or for
    line ends inside brackets; just start over on the next line */
Guido van Rossuma849b831993-05-12 11:35:44 +00001259 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001260 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001261 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001262 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001263 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 return NEWLINE;
1265 }
1266
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001267 /* Period or number starting with period? */
1268 if (c == '.') {
1269 c = tok_nextc(tok);
1270 if (isdigit(c)) {
1271 goto fraction;
1272 }
1273 else {
1274 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001275 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001276 *p_end = tok->cur;
1277 return DOT;
1278 }
1279 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001280
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 /* Number */
1282 if (isdigit(c)) {
1283 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001284 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 c = tok_nextc(tok);
1286 if (c == '.')
1287 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001288#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001289 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001290 goto imaginary;
1291#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 if (c == 'x' || c == 'X') {
1293 /* Hex */
1294 do {
1295 c = tok_nextc(tok);
1296 } while (isxdigit(c));
1297 }
1298 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001299 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 /* Octal; c is first char of it */
1301 /* There's no 'isoctdigit' macro, sigh */
1302 while ('0' <= c && c < '8') {
1303 c = tok_nextc(tok);
1304 }
 /* A digit 8 or 9 after a leading zero is only acceptable when the
    literal goes on to be a float or imaginary number; otherwise it
    is rejected with E_TOKEN below */
Tim Petersd507dab2001-08-30 20:51:59 +00001305 if (isdigit(c)) {
1306 found_decimal = 1;
1307 do {
1308 c = tok_nextc(tok);
1309 } while (isdigit(c));
1310 }
1311 if (c == '.')
1312 goto fraction;
1313 else if (c == 'e' || c == 'E')
1314 goto exponent;
1315#ifndef WITHOUT_COMPLEX
1316 else if (c == 'j' || c == 'J')
1317 goto imaginary;
1318#endif
1319 else if (found_decimal) {
1320 tok->done = E_TOKEN;
1321 tok_backup(tok, c);
1322 return ERRORTOKEN;
1323 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 }
 /* Optional 'l'/'L' suffix on a hex or octal literal */
Guido van Rossumf023c461991-05-05 20:16:20 +00001325 if (c == 'l' || c == 'L')
1326 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 }
1328 else {
1329 /* Decimal */
1330 do {
1331 c = tok_nextc(tok);
1332 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001333 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001335 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001336 /* Accept floating point numbers. */
 /* The fraction, exponent and imaginary labels below are also
    jumped to from the leading-period and leading-zero cases above */
Guido van Rossumf023c461991-05-05 20:16:20 +00001337 if (c == '.') {
1338 fraction:
1339 /* Fraction */
1340 do {
1341 c = tok_nextc(tok);
1342 } while (isdigit(c));
1343 }
1344 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001345 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001346 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001348 if (c == '+' || c == '-')
1349 c = tok_nextc(tok);
 /* At least one digit must follow the 'e' and optional sign */
Tim Peters9aa70d92001-08-27 19:19:28 +00001350 if (!isdigit(c)) {
1351 tok->done = E_TOKEN;
1352 tok_backup(tok, c);
1353 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001354 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001355 do {
1356 c = tok_nextc(tok);
1357 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001359#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001360 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001361 /* Imaginary part */
1362 imaginary:
1363 c = tok_nextc(tok);
1364#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 }
1366 }
1367 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001368 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 *p_end = tok->cur;
1370 return NUMBER;
1371 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001372
1373 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001374 /* String */
1375 if (c == '\'' || c == '"') {
 /* quote2 is the value tok->cur - tok->start will have right after
    the character following the opening quote has been read; it is
    used to recognise an empty string or the start of a triple quote.
    tripcount counts consecutive quote characters seen inside the
    string and is reset by any other character. */
Guido van Rossum35685241998-02-16 15:42:50 +00001376 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001377 int quote = c;
1378 int triple = 0;
1379 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 for (;;) {
1381 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001382 if (c == '\n') {
1383 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001384 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 tok_backup(tok, c);
1386 return ERRORTOKEN;
1387 }
1388 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001389 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001390 }
1391 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001392 if (triple)
1393 tok->done = E_EOFS;
1394 else
1395 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001396 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 return ERRORTOKEN;
1398 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 else if (c == quote) {
1400 tripcount++;
 /* Second quote right after the opening one: a third quote makes
    this a triple-quoted string, anything else an empty string */
Guido van Rossum35685241998-02-16 15:42:50 +00001401 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001402 c = tok_nextc(tok);
1403 if (c == quote) {
1404 triple = 1;
1405 tripcount = 0;
1406 continue;
1407 }
1408 tok_backup(tok, c);
1409 }
1410 if (!triple || tripcount == 3)
1411 break;
1412 }
1413 else if (c == '\\') {
1414 tripcount = 0;
 /* Consume the character after the backslash without interpreting it */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001416 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001417 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001418 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 return ERRORTOKEN;
1420 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001422 else
1423 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001425 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001426 *p_end = tok->cur;
1427 return STRING;
1428 }
1429
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 /* Line continuation */
1431 if (c == '\\') {
1432 c = tok_nextc(tok);
1433 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001434 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001435 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001436 return ERRORTOKEN;
1437 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001438 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001439 goto again; /* Read next line */
1440 }
1441
Guido van Rossumfbab9051991-10-20 20:25:03 +00001442 /* Check for two-character token */
1443 {
1444 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001445 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001446 if (token != OP) {
 /* A three-character operator is only tried when the first two
    characters already form an operator */
Thomas Wouters434d0822000-08-24 20:11:32 +00001447 int c3 = tok_nextc(tok);
1448 int token3 = PyToken_ThreeChars(c, c2, c3);
1449 if (token3 != OP) {
1450 token = token3;
1451 } else {
1452 tok_backup(tok, c3);
1453 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001454 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001455 *p_end = tok->cur;
1456 return token;
1457 }
1458 tok_backup(tok, c2);
1459 }
1460
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001461 /* Keep track of parentheses nesting level */
 /* NOTE(review): the level is decremented without a lower-bound check,
    so an unmatched closing bracket makes it negative; the indentation
    code above only runs when level == 0 -- confirm this is intended */
Guido van Rossuma849b831993-05-12 11:35:44 +00001462 switch (c) {
1463 case '(':
1464 case '[':
1465 case '{':
1466 tok->level++;
1467 break;
1468 case ')':
1469 case ']':
1470 case '}':
1471 tok->level--;
1472 break;
1473 }
1474
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001475 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001476 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001477 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001478 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479}
1480
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001481int
1482PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1483{
1484 int result = tok_get(tok, p_start, p_end);
1485 if (tok->decoding_erred) {
1486 result = ERRORTOKEN;
1487 tok->done = E_DECODE;
1488 }
1489 return result;
1490}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001491
Guido van Rossum408027e1996-12-30 16:17:54 +00001492#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001493
1494void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001495tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001496{
Guido van Rossum86bea461997-04-29 21:03:06 +00001497 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001498 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1499 printf("(%.*s)", (int)(end - start), start);
1500}
1501
1502#endif