blob: a08f1838af36642486cedcced5fa3a609f778d93 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
/* Line reader defined outside this file; tok_nextc calls it with stdin,
   stdout and the current prompt when the tokenizer has a prompt set. */
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
/* These helpers are defined further down in this file. */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable token names, one entry per token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT",
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000130 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000138 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000144 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145}
146
/* PGEN build: the input string is used as is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* Map the spellings of the two built-in encodings onto a canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when S
   names one of them (optionally followed by "-suffix"); otherwise returns
   S itself, so callers can detect "no change" by pointer comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a byte >= 0x80 where char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
201static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000202get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000204 Py_ssize_t i;
205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
212 }
213 for (; i < size - 6; i++) { /* XXX inefficient search */
214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000223
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000224 begin = t;
225 while (isalnum(Py_CHARMASK(t[0])) ||
226 t[0] == '-' || t[0] == '_' || t[0] == '.')
227 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
233 PyMem_FREE(r);
234 r = new_string(q, strlen(q));
235 }
236 return r;
237 }
238 }
239 }
240 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
248static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000249check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000250 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000252 char * cs;
253 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000254
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
258 cs = get_coding_spec(line, size);
259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000267#ifdef Py_USING_UNICODE
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
272 }
273 else
274 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#else
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
285 PyMem_FREE(cs);
286 }
287 }
288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
294 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000306{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000307 int ch1, ch2, ch3;
308 ch1 = get_char(tok);
309 tok->decoding_state = 1;
310 if (ch1 == EOF) {
311 return 1;
312 } else if (ch1 == 0xEF) {
313 ch2 = get_char(tok);
314 if (ch2 != 0xBB) {
315 unget_char(ch2, tok);
316 unget_char(ch1, tok);
317 return 1;
318 }
319 ch3 = get_char(tok);
320 if (ch3 != 0xBF) {
321 unget_char(ch3, tok);
322 unget_char(ch2, tok);
323 unget_char(ch1, tok);
324 return 1;
325 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326#if 0
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000327 /* Disable support for UTF-16 BOMs until a decision
328 is made whether this needs to be supported. */
329 } else if (ch1 == 0xFE) {
330 ch2 = get_char(tok);
331 if (ch2 != 0xFF) {
332 unget_char(ch2, tok);
333 unget_char(ch1, tok);
334 return 1;
335 }
336 if (!set_readline(tok, "utf-16-be"))
337 return 0;
338 tok->decoding_state = -1;
339 } else if (ch1 == 0xFF) {
340 ch2 = get_char(tok);
341 if (ch2 != 0xFE) {
342 unget_char(ch2, tok);
343 unget_char(ch1, tok);
344 return 1;
345 }
346 if (!set_readline(tok, "utf-16-le"))
347 return 0;
348 tok->decoding_state = -1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000350 } else {
351 unget_char(ch1, tok);
352 return 1;
353 }
354 if (tok->encoding != NULL)
355 PyMem_FREE(tok->encoding);
356 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
357 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000358}
359
360/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000361 Return NULL on failure, else S.
Tim Petersc9d78aa2006-03-26 23:27:58 +0000362
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363 On entry, tok->decoding_buffer will be one of:
364 1) NULL: need to call tok->decoding_readline to get a new line
365 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000366 stored the result in tok->decoding_buffer
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000367 3) PyStringObject *: previous call to fp_readl did not have enough room
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000368 (in the s buffer) to copy entire contents of the line read
369 by tok->decoding_readline. tok->decoding_buffer has the overflow.
370 In this case, fp_readl is called in a loop (with an expanded buffer)
371 until the buffer ends with a '\n' (or until the end of the file is
372 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000373*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000374
/* S/SIZE: destination buffer and its capacity (must be > 0).
   At most SIZE-1 bytes are stored, followed by a NUL.  Whatever does not
   fit is parked in tok->decoding_buffer as a string object for the next
   call.  Returns NULL at EOF (empty line) or after error_ret(). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    Py_ssize_t room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the parked object; a plain string is overflow from
           an earlier call and is already UTF-8. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }

    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
424
425/* Set the readline function for TOK to a StreamReader's
426 readline function. The StreamReader is named ENC.
427
428 This function is called from check_bom and check_coding_spec.
429
430 ENC is usually identical to the future value of tok->encoding,
431 except for the (currently unsupported) case of UTF-16.
432
433 Return 1 on success, 0 on failure. */
434
435static int
436fp_setreadl(struct tok_state *tok, const char* enc)
437{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000438 PyObject *reader, *stream, *readline;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000439
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000440 /* XXX: constify filename argument. */
441 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
442 if (stream == NULL)
443 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000445 reader = PyCodec_StreamReader(enc, stream, NULL);
446 Py_DECREF(stream);
447 if (reader == NULL)
448 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000450 readline = PyObject_GetAttrString(reader, "readline");
451 Py_DECREF(reader);
452 if (readline == NULL)
453 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000455 tok->decoding_readline = readline;
456 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000457}
458
459/* Fetch the next byte from TOK. */
460
461static int fp_getc(struct tok_state *tok) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000462 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463}
464
465/* Unfetch the last byte back into TOK. */
466
467static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000468 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469}
470
471/* Read a line of input from TOK. Determine encoding
472 if necessary. */
473
474static char *
475decoding_fgets(char *s, int size, struct tok_state *tok)
476{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000477 char *line = NULL;
478 int badchar = 0;
479 for (;;) {
480 if (tok->decoding_state < 0) {
481 /* We already have a codec associated with
482 this input. */
483 line = fp_readl(s, size, tok);
484 break;
485 } else if (tok->decoding_state > 0) {
486 /* We want a 'raw' read. */
487 line = Py_UniversalNewlineFgets(s, size,
488 tok->fp, NULL);
489 break;
490 } else {
491 /* We have not yet determined the encoding.
492 If an encoding is found, use the file-pointer
493 reader functions from now on. */
494 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
495 return error_ret(tok);
496 assert(tok->decoding_state != 0);
497 }
498 }
499 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
500 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
501 return error_ret(tok);
502 }
503 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504#ifndef PGEN
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000505 /* The default encoding is ASCII, so make sure we don't have any
506 non-ASCII bytes in it. */
507 if (line && !tok->encoding) {
508 unsigned char *c;
509 for (c = (unsigned char *)line; *c; c++)
510 if (*c > 127) {
511 badchar = *c;
512 break;
513 }
514 }
515 if (badchar) {
516 char buf[500];
517 /* Need to add 1 to the line number, since this line
518 has not been counted, yet. */
519 sprintf(buf,
520 "Non-ASCII character '\\x%.2x' "
521 "in file %.200s on line %i, "
522 "but no encoding declared; "
523 "see http://www.python.org/peps/pep-0263.html for details",
524 badchar, tok->filename, tok->lineno + 1);
525 PyErr_SetString(PyExc_SyntaxError, buf);
526 return error_ret(tok);
527 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000529 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530}
531
532static int
533decoding_feof(struct tok_state *tok)
534{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000535 if (tok->decoding_state >= 0) {
536 return feof(tok->fp);
537 } else {
538 PyObject* buf = tok->decoding_buffer;
539 if (buf == NULL) {
540 buf = PyObject_CallObject(tok->decoding_readline, NULL);
541 if (buf == NULL) {
542 error_ret(tok);
543 return 1;
544 } else {
545 tok->decoding_buffer = buf;
546 }
547 }
548 return PyObject_Length(buf) == 0;
549 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550}
551
552/* Fetch a byte from TOK, using the string buffer. */
553
Tim Petersc9d78aa2006-03-26 23:27:58 +0000554static int
555buf_getc(struct tok_state *tok) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000556 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557}
558
559/* Unfetch a byte from TOK, using the string buffer. */
560
Tim Petersc9d78aa2006-03-26 23:27:58 +0000561static void
562buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000563 tok->str--;
564 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565}
566
567/* Set the readline function for TOK to ENC. For the string-based
568 tokenizer, this means to just record the encoding. */
569
Tim Petersc9d78aa2006-03-26 23:27:58 +0000570static int
571buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000572 tok->enc = enc;
573 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574}
575
576/* Return a UTF-8 encoding Python string object from the
577 C byte string STR, which is encoded with ENC. */
578
Martin v. Löwis019934b2002-08-07 12:33:18 +0000579#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580static PyObject *
581translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000582 PyObject *utf8;
583 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
584 if (buf == NULL)
585 return NULL;
586 utf8 = PyUnicode_AsUTF8String(buf);
587 Py_DECREF(buf);
588 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000590#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591
592/* Decode a byte string STR for use as the buffer of TOK.
593 Look for encoding declarations inside STR, and record them
594 inside TOK. */
595
596static const char *
597decode_str(const char *str, struct tok_state *tok)
598{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000599 PyObject* utf8 = NULL;
600 const char *s;
601 const char *newl[2] = {NULL, NULL};
602 int lineno = 0;
603 tok->enc = NULL;
604 tok->str = str;
605 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
606 return error_ret(tok);
607 str = tok->str; /* string after BOM if any */
608 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000609#ifdef Py_USING_UNICODE
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000610 if (tok->enc != NULL) {
611 utf8 = translate_into_utf8(str, tok->enc);
612 if (utf8 == NULL)
613 return error_ret(tok);
614 str = PyString_AsString(utf8);
615 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000616#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000617 for (s = str;; s++) {
618 if (*s == '\0') break;
619 else if (*s == '\n') {
620 assert(lineno < 2);
621 newl[lineno] = s;
622 lineno++;
623 if (lineno == 2) break;
624 }
625 }
626 tok->enc = NULL;
627 /* need to check line 1 and 2 separately since check_coding_spec
628 assumes a single line as input */
629 if (newl[0]) {
630 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
631 return error_ret(tok);
632 if (tok->enc == NULL && newl[1]) {
633 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
634 tok, buf_setreadl))
635 return error_ret(tok);
636 }
637 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000638#ifdef Py_USING_UNICODE
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000639 if (tok->enc != NULL) {
640 assert(utf8 == NULL);
641 utf8 = translate_into_utf8(str, tok->enc);
642 if (utf8 == NULL) {
643 PyErr_Format(PyExc_SyntaxError,
644 "unknown encoding: %s", tok->enc);
645 return error_ret(tok);
646 }
647 str = PyString_AsString(utf8);
648 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000649#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000650 assert(tok->decoding_buffer == NULL);
651 tok->decoding_buffer = utf8; /* CAUTION */
652 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653}
654
655#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656
657/* Set up tokenizer for string */
658
659struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000660PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
665 str = (char *)decode_str(str, tok);
666 if (str == NULL) {
667 PyTokenizer_Free(tok);
668 return NULL;
669 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000670
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000671 /* XXX: constify members. */
672 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
673 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000674}
675
676
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000677/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678
679struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000680PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000682 struct tok_state *tok = tok_new();
683 if (tok == NULL)
684 return NULL;
685 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
686 PyTokenizer_Free(tok);
687 return NULL;
688 }
689 tok->cur = tok->inp = tok->buf;
690 tok->end = tok->buf + BUFSIZ;
691 tok->fp = fp;
692 tok->prompt = ps1;
693 tok->nextprompt = ps2;
694 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000695}
696
697
698/* Free a tok_state structure */
699
700void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000701PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000703 if (tok->encoding != NULL)
704 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000705#ifndef PGEN
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000706 Py_XDECREF(tok->decoding_readline);
707 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000708#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000709 if (tok->fp != NULL && tok->buf != NULL)
710 PyMem_FREE(tok->buf);
711 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712}
713
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000714#if !defined(PGEN) && defined(Py_USING_UNICODE)
715static int
716tok_stdin_decode(struct tok_state *tok, char **inp)
717{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000718 PyObject *enc, *sysstdin, *decoded, *utf8;
719 const char *encoding;
720 char *converted;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000721
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000722 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
723 return 0;
724 sysstdin = PySys_GetObject("stdin");
725 if (sysstdin == NULL || !PyFile_Check(sysstdin))
726 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000727
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000728 enc = ((PyFileObject *)sysstdin)->f_encoding;
729 if (enc == NULL || !PyString_Check(enc))
730 return 0;
731 Py_INCREF(enc);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000732
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000733 encoding = PyString_AsString(enc);
734 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
735 if (decoded == NULL)
736 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000737
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000738 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
739 Py_DECREF(decoded);
740 if (utf8 == NULL)
741 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000742
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000743 assert(PyString_Check(utf8));
744 converted = new_string(PyString_AS_STRING(utf8),
745 PyString_GET_SIZE(utf8));
746 Py_DECREF(utf8);
747 if (converted == NULL)
748 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000749
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000750 PyMem_FREE(*inp);
751 *inp = converted;
752 if (tok->encoding != NULL)
753 PyMem_FREE(tok->encoding);
754 tok->encoding = new_string(encoding, strlen(encoding));
755 if (tok->encoding == NULL)
756 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000757
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000758 Py_DECREF(enc);
759 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000760
761error_nomem:
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000762 Py_DECREF(enc);
763 tok->done = E_NOMEM;
764 return -1;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000765
766error_clear:
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000767 Py_DECREF(enc);
768 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
769 tok->done = E_ERROR;
770 return -1;
771 }
772 /* Fallback to iso-8859-1: for backward compatibility */
773 PyErr_Clear();
774 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000775}
776#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777
778/* Get next char, updating state; error code goes into tok->done */
779
/* Return the next input character (masked with Py_CHARMASK), refilling
   the line buffer when it is exhausted; return EOF when no more input is
   available, with the reason left in tok->done.

   Three input sources are handled, tested in this order:
     - tok->fp == NULL: the whole text is already in memory; only the
       buffer pointers advance to the next line.
     - tok->prompt != NULL: a line is obtained from PyOS_Readline.
     - otherwise: a line is read from tok->fp via decoding_fgets, growing
       the buffer in BUFSIZ steps until the line is complete. */
780static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000781tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000782{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000783 for (;;) {
784 if (tok->cur != tok->inp) {
785 return Py_CHARMASK(*tok->cur++); /* Fast path */
786 }
/* Buffer exhausted: a previously recorded condition ends the input. */
787 if (tok->done != E_OK)
788 return EOF;
/* String input: the next line runs up to and including the next '\n',
   or to the terminating NUL when there is no further newline. */
789 if (tok->fp == NULL) {
790 char *end = strchr(tok->inp, '\n');
791 if (end != NULL)
792 end++;
793 else {
794 end = strchr(tok->inp, '\0');
795 if (end == tok->inp) {
796 tok->done = E_EOF;
797 return EOF;
798 }
799 }
800 if (tok->start == NULL)
801 tok->buf = tok->cur;
802 tok->line_start = tok->cur;
803 tok->lineno++;
804 tok->inp = end;
805 return Py_CHARMASK(*tok->cur++);
806 }
/* Interactive input: NULL from PyOS_Readline is recorded as E_INTR, an
   empty string as E_EOF. */
807 if (tok->prompt != NULL) {
808 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
809 if (tok->nextprompt != NULL)
810 tok->prompt = tok->nextprompt;
811 if (newtok == NULL)
812 tok->done = E_INTR;
813 else if (*newtok == '\0') {
814 PyMem_FREE(newtok);
815 tok->done = E_EOF;
816 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000817#if !defined(PGEN) && defined(Py_USING_UNICODE)
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000818 else if (tok_stdin_decode(tok, &newtok) != 0)
819 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000820#endif
/* tok->start is set: append the new line to the existing buffer and
   re-derive every pointer from its saved offset, since the realloc may
   move the buffer. */
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000821 else if (tok->start != NULL) {
822 size_t start = tok->start - tok->buf;
823 size_t oldlen = tok->cur - tok->buf;
824 size_t newlen = oldlen + strlen(newtok);
825 char *buf = tok->buf;
826 buf = (char *)PyMem_REALLOC(buf, newlen+1);
827 tok->lineno++;
828 if (buf == NULL) {
829 PyMem_FREE(tok->buf);
830 tok->buf = NULL;
831 PyMem_FREE(newtok);
832 tok->done = E_NOMEM;
833 return EOF;
834 }
835 tok->buf = buf;
836 tok->cur = tok->buf + oldlen;
837 tok->line_start = tok->cur;
838 strcpy(tok->buf + oldlen, newtok);
839 PyMem_FREE(newtok);
840 tok->inp = tok->buf + newlen;
841 tok->end = tok->inp + 1;
842 tok->start = tok->buf + start;
843 }
/* tok->start is NULL: the line just read simply replaces the old
   buffer. */
844 else {
845 tok->lineno++;
846 if (tok->buf != NULL)
847 PyMem_FREE(tok->buf);
848 tok->buf = newtok;
849 tok->line_start = tok->buf;
850 tok->cur = tok->buf;
851 tok->line_start = tok->buf;
852 tok->inp = strchr(tok->buf, '\0');
853 tok->end = tok->inp + 1;
854 }
855 }
/* File input. */
856 else {
857 int done = 0;
858 Py_ssize_t cur = 0;
859 char *pt;
/* tok->start is NULL: read into the start of the buffer, allocating it
   first if an earlier error_ret() released it. */
860 if (tok->start == NULL) {
861 if (tok->buf == NULL) {
862 tok->buf = (char *)
863 PyMem_MALLOC(BUFSIZ);
864 if (tok->buf == NULL) {
865 tok->done = E_NOMEM;
866 return EOF;
867 }
868 tok->end = tok->buf + BUFSIZ;
869 }
870 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
871 tok) == NULL) {
872 tok->done = E_EOF;
873 done = 1;
874 }
875 else {
876 tok->done = E_OK;
877 tok->inp = strchr(tok->buf, '\0');
878 done = tok->inp[-1] == '\n';
879 }
880 }
/* tok->start is set: keep the buffered text and remember the read
   offset; new data is appended by the loop below. */
881 else {
882 cur = tok->cur - tok->buf;
883 if (decoding_feof(tok)) {
884 tok->done = E_EOF;
885 done = 1;
886 }
887 else
888 tok->done = E_OK;
889 }
890 tok->lineno++;
891 /* Read until '\n' or EOF */
892 while (!done) {
/* Pointers are saved as offsets because the realloc may move the
   buffer; -1 stands for "tok->start is NULL". */
893 Py_ssize_t curstart = tok->start == NULL ? -1 :
894 tok->start - tok->buf;
895 Py_ssize_t curvalid = tok->inp - tok->buf;
896 Py_ssize_t newsize = curvalid + BUFSIZ;
897 char *newbuf = tok->buf;
898 newbuf = (char *)PyMem_REALLOC(newbuf,
899 newsize);
900 if (newbuf == NULL) {
901 tok->done = E_NOMEM;
902 tok->cur = tok->inp;
903 return EOF;
904 }
905 tok->buf = newbuf;
906 tok->inp = tok->buf + curvalid;
907 tok->end = tok->buf + newsize;
908 tok->start = curstart < 0 ? NULL :
909 tok->buf + curstart;
910 if (decoding_fgets(tok->inp,
911 (int)(tok->end - tok->inp),
912 tok) == NULL) {
913 /* Break out early on decoding
914 errors, as tok->buf will be NULL
915 */
916 if (tok->decoding_erred)
917 return EOF;
918 /* Last line does not end in \n,
919 fake one */
920 strcpy(tok->inp, "\n");
921 }
922 tok->inp = strchr(tok->inp, '\0');
923 done = tok->inp[-1] == '\n';
924 }
/* tok->buf is NULL here when decoding_fgets failed through error_ret(). */
925 if (tok->buf != NULL) {
926 tok->cur = tok->buf + cur;
927 tok->line_start = tok->cur;
928 /* replace "\r\n" with "\n" */
929 /* For Mac leave the \r, giving a syntax error */
930 pt = tok->inp - 2;
931 if (pt >= tok->buf && *pt == '\r') {
932 *pt++ = '\n';
933 *pt = '\0';
934 tok->inp = pt;
935 }
936 }
937 }
/* Any condition recorded above ends the input; in interactive mode a
   newline is written to stderr first. */
938 if (tok->done != E_OK) {
939 if (tok->prompt != NULL)
940 PySys_WriteStderr("\n");
941 tok->cur = tok->inp;
942 return EOF;
943 }
944 }
945 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946}
947
948
949/* Back-up one character */
950
951static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000952tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000954 if (c != EOF) {
955 if (--tok->cur < tok->buf)
956 Py_FatalError("tok_backup: begin of buffer");
957 if (*tok->cur != c)
958 *tok->cur = c;
959 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000960}
961
962
963/* Return the token corresponding to a single character */
964
965int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000966PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000967{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000968 switch (c) {
969 case '(': return LPAR;
970 case ')': return RPAR;
971 case '[': return LSQB;
972 case ']': return RSQB;
973 case ':': return COLON;
974 case ',': return COMMA;
975 case ';': return SEMI;
976 case '+': return PLUS;
977 case '-': return MINUS;
978 case '*': return STAR;
979 case '/': return SLASH;
980 case '|': return VBAR;
981 case '&': return AMPER;
982 case '<': return LESS;
983 case '>': return GREATER;
984 case '=': return EQUAL;
985 case '.': return DOT;
986 case '%': return PERCENT;
987 case '`': return BACKQUOTE;
988 case '{': return LBRACE;
989 case '}': return RBRACE;
990 case '^': return CIRCUMFLEX;
991 case '~': return TILDE;
992 case '@': return AT;
993 default: return OP;
994 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995}
996
997
Guido van Rossumfbab9051991-10-20 20:25:03 +0000998int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000999PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001000{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001001 switch (c1) {
1002 case '=':
1003 switch (c2) {
1004 case '=': return EQEQUAL;
1005 }
1006 break;
1007 case '!':
1008 switch (c2) {
1009 case '=': return NOTEQUAL;
1010 }
1011 break;
1012 case '<':
1013 switch (c2) {
1014 case '>': return NOTEQUAL;
1015 case '=': return LESSEQUAL;
1016 case '<': return LEFTSHIFT;
1017 }
1018 break;
1019 case '>':
1020 switch (c2) {
1021 case '=': return GREATEREQUAL;
1022 case '>': return RIGHTSHIFT;
1023 }
1024 break;
1025 case '+':
1026 switch (c2) {
1027 case '=': return PLUSEQUAL;
1028 }
1029 break;
1030 case '-':
1031 switch (c2) {
1032 case '=': return MINEQUAL;
1033 }
1034 break;
1035 case '*':
1036 switch (c2) {
1037 case '*': return DOUBLESTAR;
1038 case '=': return STAREQUAL;
1039 }
1040 break;
1041 case '/':
1042 switch (c2) {
1043 case '/': return DOUBLESLASH;
1044 case '=': return SLASHEQUAL;
1045 }
1046 break;
1047 case '|':
1048 switch (c2) {
1049 case '=': return VBAREQUAL;
1050 }
1051 break;
1052 case '%':
1053 switch (c2) {
1054 case '=': return PERCENTEQUAL;
1055 }
1056 break;
1057 case '&':
1058 switch (c2) {
1059 case '=': return AMPEREQUAL;
1060 }
1061 break;
1062 case '^':
1063 switch (c2) {
1064 case '=': return CIRCUMFLEXEQUAL;
1065 }
1066 break;
1067 }
1068 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001069}
1070
Thomas Wouters434d0822000-08-24 20:11:32 +00001071int
1072PyToken_ThreeChars(int c1, int c2, int c3)
1073{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001074 switch (c1) {
1075 case '<':
1076 switch (c2) {
1077 case '<':
1078 switch (c3) {
1079 case '=':
1080 return LEFTSHIFTEQUAL;
1081 }
1082 break;
1083 }
1084 break;
1085 case '>':
1086 switch (c2) {
1087 case '>':
1088 switch (c3) {
1089 case '=':
1090 return RIGHTSHIFTEQUAL;
1091 }
1092 break;
1093 }
1094 break;
1095 case '*':
1096 switch (c2) {
1097 case '*':
1098 switch (c3) {
1099 case '=':
1100 return DOUBLESTAREQUAL;
1101 }
1102 break;
1103 }
1104 break;
1105 case '/':
1106 switch (c2) {
1107 case '/':
1108 switch (c3) {
1109 case '=':
1110 return DOUBLESLASHEQUAL;
1111 }
1112 break;
1113 }
1114 break;
1115 }
1116 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001117}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001118
Guido van Rossum926f13a1998-04-09 21:38:06 +00001119static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001120indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001121{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001122 if (tok->alterror) {
1123 tok->done = E_TABSPACE;
1124 tok->cur = tok->inp;
1125 return 1;
1126 }
1127 if (tok->altwarning) {
1128 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1129 "in indentation\n", tok->filename);
1130 tok->altwarning = 0;
1131 }
1132 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133}
1134
1135
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136/* Get next token, after space stripping etc. */
1137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001138static int
1139tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001140{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001141 register int c;
1142 int blankline;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001144 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001145 nextline:
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001146 tok->start = NULL;
1147 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001148
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001149 /* Get indentation level */
1150 if (tok->atbol) {
1151 register int col = 0;
1152 register int altcol = 0;
1153 tok->atbol = 0;
1154 for (;;) {
1155 c = tok_nextc(tok);
1156 if (c == ' ')
1157 col++, altcol++;
1158 else if (c == '\t') {
1159 col = (col/tok->tabsize + 1) * tok->tabsize;
1160 altcol = (altcol/tok->alttabsize + 1)
1161 * tok->alttabsize;
1162 }
1163 else if (c == '\014') /* Control-L (formfeed) */
1164 col = altcol = 0; /* For Emacs users */
1165 else
1166 break;
1167 }
1168 tok_backup(tok, c);
1169 if (c == '#' || c == '\n') {
1170 /* Lines with only whitespace and/or comments
1171 shouldn't affect the indentation and are
1172 not passed to the parser as NEWLINE tokens,
1173 except *totally* empty lines in interactive
1174 mode, which signal the end of a command group. */
1175 if (col == 0 && c == '\n' && tok->prompt != NULL)
1176 blankline = 0; /* Let it through */
1177 else
1178 blankline = 1; /* Ignore completely */
1179 /* We can't jump back right here since we still
1180 may need to skip to the end of a comment */
1181 }
1182 if (!blankline && tok->level == 0) {
1183 if (col == tok->indstack[tok->indent]) {
1184 /* No change */
1185 if (altcol != tok->altindstack[tok->indent]) {
1186 if (indenterror(tok))
1187 return ERRORTOKEN;
1188 }
1189 }
1190 else if (col > tok->indstack[tok->indent]) {
1191 /* Indent -- always one */
1192 if (tok->indent+1 >= MAXINDENT) {
1193 tok->done = E_TOODEEP;
1194 tok->cur = tok->inp;
1195 return ERRORTOKEN;
1196 }
1197 if (altcol <= tok->altindstack[tok->indent]) {
1198 if (indenterror(tok))
1199 return ERRORTOKEN;
1200 }
1201 tok->pendin++;
1202 tok->indstack[++tok->indent] = col;
1203 tok->altindstack[tok->indent] = altcol;
1204 }
1205 else /* col < tok->indstack[tok->indent] */ {
1206 /* Dedent -- any number, must be consistent */
1207 while (tok->indent > 0 &&
1208 col < tok->indstack[tok->indent]) {
1209 tok->pendin--;
1210 tok->indent--;
1211 }
1212 if (col != tok->indstack[tok->indent]) {
1213 tok->done = E_DEDENT;
1214 tok->cur = tok->inp;
1215 return ERRORTOKEN;
1216 }
1217 if (altcol != tok->altindstack[tok->indent]) {
1218 if (indenterror(tok))
1219 return ERRORTOKEN;
1220 }
1221 }
1222 }
1223 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001224
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001225 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001226
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001227 /* Return pending indents/dedents */
1228 if (tok->pendin != 0) {
1229 if (tok->pendin < 0) {
1230 tok->pendin++;
1231 return DEDENT;
1232 }
1233 else {
1234 tok->pendin--;
1235 return INDENT;
1236 }
1237 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001238
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001239 again:
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001240 tok->start = NULL;
1241 /* Skip spaces */
1242 do {
1243 c = tok_nextc(tok);
1244 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001245
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001246 /* Set start of current token */
1247 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001248
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001249 /* Skip comment, while looking for tab-setting magic */
1250 if (c == '#') {
1251 static char *tabforms[] = {
1252 "tab-width:", /* Emacs */
1253 ":tabstop=", /* vim, full form */
1254 ":ts=", /* vim, abbreviated form */
1255 "set tabsize=", /* will vi never die? */
1256 /* more templates can be added here to support other editors */
1257 };
1258 char cbuf[80];
1259 char *tp, **cp;
1260 tp = cbuf;
1261 do {
1262 *tp++ = c = tok_nextc(tok);
1263 } while (c != EOF && c != '\n' &&
1264 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1265 *tp = '\0';
1266 for (cp = tabforms;
1267 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1268 cp++) {
1269 if ((tp = strstr(cbuf, *cp))) {
1270 int newsize = atoi(tp + strlen(*cp));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001271
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001272 if (newsize >= 1 && newsize <= 40) {
1273 tok->tabsize = newsize;
1274 if (Py_VerboseFlag)
1275 PySys_WriteStderr(
1276 "Tab size set to %d\n",
1277 newsize);
1278 }
1279 }
1280 }
1281 while (c != EOF && c != '\n')
1282 c = tok_nextc(tok);
1283 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001284
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001285 /* Check for EOF and errors now */
1286 if (c == EOF) {
1287 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1288 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001289
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001290 /* Identifier (most frequent token!) */
1291 if (isalpha(c) || c == '_') {
1292 /* Process r"", u"" and ur"" */
1293 switch (c) {
1294 case 'b':
1295 case 'B':
1296 c = tok_nextc(tok);
1297 if (c == 'r' || c == 'R')
1298 c = tok_nextc(tok);
1299 if (c == '"' || c == '\'')
1300 goto letter_quote;
1301 break;
1302 case 'r':
1303 case 'R':
1304 c = tok_nextc(tok);
1305 if (c == '"' || c == '\'')
1306 goto letter_quote;
1307 break;
1308 case 'u':
1309 case 'U':
1310 c = tok_nextc(tok);
1311 if (c == 'r' || c == 'R')
1312 c = tok_nextc(tok);
1313 if (c == '"' || c == '\'')
1314 goto letter_quote;
1315 break;
1316 }
1317 while (isalnum(c) || c == '_') {
1318 c = tok_nextc(tok);
1319 }
1320 tok_backup(tok, c);
1321 *p_start = tok->start;
1322 *p_end = tok->cur;
1323 return NAME;
1324 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001325
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001326 /* Newline */
1327 if (c == '\n') {
1328 tok->atbol = 1;
1329 if (blankline || tok->level > 0)
1330 goto nextline;
1331 *p_start = tok->start;
1332 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1333 tok->cont_line = 0;
1334 return NEWLINE;
1335 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001336
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001337 /* Period or number starting with period? */
1338 if (c == '.') {
1339 c = tok_nextc(tok);
1340 if (isdigit(c)) {
1341 goto fraction;
1342 }
1343 else {
1344 tok_backup(tok, c);
1345 *p_start = tok->start;
1346 *p_end = tok->cur;
1347 return DOT;
1348 }
1349 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001350
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001351 /* Number */
1352 if (isdigit(c)) {
1353 if (c == '0') {
1354 /* Hex, octal or binary -- maybe. */
1355 c = tok_nextc(tok);
1356 if (c == '.')
1357 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001358#ifndef WITHOUT_COMPLEX
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001359 if (c == 'j' || c == 'J')
1360 goto imaginary;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001361#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001362 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001363
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001364 /* Hex */
1365 c = tok_nextc(tok);
1366 if (!isxdigit(c)) {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
1371 do {
1372 c = tok_nextc(tok);
1373 } while (isxdigit(c));
1374 }
1375 else if (c == 'o' || c == 'O') {
1376 /* Octal */
1377 c = tok_nextc(tok);
1378 if (c < '0' || c >= '8') {
1379 tok->done = E_TOKEN;
1380 tok_backup(tok, c);
1381 return ERRORTOKEN;
1382 }
1383 do {
1384 c = tok_nextc(tok);
1385 } while ('0' <= c && c < '8');
1386 }
1387 else if (c == 'b' || c == 'B') {
1388 /* Binary */
1389 c = tok_nextc(tok);
1390 if (c != '0' && c != '1') {
1391 tok->done = E_TOKEN;
1392 tok_backup(tok, c);
1393 return ERRORTOKEN;
1394 }
1395 do {
1396 c = tok_nextc(tok);
1397 } while (c == '0' || c == '1');
1398 }
1399 else {
1400 int found_decimal = 0;
1401 /* Octal; c is first char of it */
1402 /* There's no 'isoctdigit' macro, sigh */
1403 while ('0' <= c && c < '8') {
1404 c = tok_nextc(tok);
1405 }
1406 if (isdigit(c)) {
1407 found_decimal = 1;
1408 do {
1409 c = tok_nextc(tok);
1410 } while (isdigit(c));
1411 }
1412 if (c == '.')
1413 goto fraction;
1414 else if (c == 'e' || c == 'E')
1415 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001416#ifndef WITHOUT_COMPLEX
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001417 else if (c == 'j' || c == 'J')
1418 goto imaginary;
Tim Petersd507dab2001-08-30 20:51:59 +00001419#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001420 else if (found_decimal) {
1421 tok->done = E_TOKEN;
1422 tok_backup(tok, c);
1423 return ERRORTOKEN;
1424 }
1425 }
1426 if (c == 'l' || c == 'L')
1427 c = tok_nextc(tok);
1428 }
1429 else {
1430 /* Decimal */
1431 do {
1432 c = tok_nextc(tok);
1433 } while (isdigit(c));
1434 if (c == 'l' || c == 'L')
1435 c = tok_nextc(tok);
1436 else {
1437 /* Accept floating point numbers. */
1438 if (c == '.') {
1439 fraction:
1440 /* Fraction */
1441 do {
1442 c = tok_nextc(tok);
1443 } while (isdigit(c));
1444 }
1445 if (c == 'e' || c == 'E') {
1446 exponent:
1447 /* Exponent part */
1448 c = tok_nextc(tok);
1449 if (c == '+' || c == '-')
1450 c = tok_nextc(tok);
1451 if (!isdigit(c)) {
1452 tok->done = E_TOKEN;
1453 tok_backup(tok, c);
1454 return ERRORTOKEN;
1455 }
1456 do {
1457 c = tok_nextc(tok);
1458 } while (isdigit(c));
1459 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001460#ifndef WITHOUT_COMPLEX
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001461 if (c == 'j' || c == 'J')
1462 /* Imaginary part */
1463 imaginary:
1464 c = tok_nextc(tok);
Guido van Rossumf595fde1996-01-12 01:31:58 +00001465#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001466 }
1467 }
1468 tok_backup(tok, c);
1469 *p_start = tok->start;
1470 *p_end = tok->cur;
1471 return NUMBER;
1472 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001473
1474 letter_quote:
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001475 /* String */
1476 if (c == '\'' || c == '"') {
1477 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1478 int quote = c;
1479 int triple = 0;
1480 int tripcount = 0;
1481 for (;;) {
1482 c = tok_nextc(tok);
1483 if (c == '\n') {
1484 if (!triple) {
1485 tok->done = E_EOLS;
1486 tok_backup(tok, c);
1487 return ERRORTOKEN;
1488 }
1489 tripcount = 0;
1490 tok->cont_line = 1; /* multiline string. */
1491 }
1492 else if (c == EOF) {
1493 if (triple)
1494 tok->done = E_EOFS;
1495 else
1496 tok->done = E_EOLS;
1497 tok->cur = tok->inp;
1498 return ERRORTOKEN;
1499 }
1500 else if (c == quote) {
1501 tripcount++;
1502 if (tok->cur - tok->start == quote2) {
1503 c = tok_nextc(tok);
1504 if (c == quote) {
1505 triple = 1;
1506 tripcount = 0;
1507 continue;
1508 }
1509 tok_backup(tok, c);
1510 }
1511 if (!triple || tripcount == 3)
1512 break;
1513 }
1514 else if (c == '\\') {
1515 tripcount = 0;
1516 c = tok_nextc(tok);
1517 if (c == EOF) {
1518 tok->done = E_EOLS;
1519 tok->cur = tok->inp;
1520 return ERRORTOKEN;
1521 }
1522 }
1523 else
1524 tripcount = 0;
1525 }
1526 *p_start = tok->start;
1527 *p_end = tok->cur;
1528 return STRING;
1529 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001530
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001531 /* Line continuation */
1532 if (c == '\\') {
1533 c = tok_nextc(tok);
1534 if (c != '\n') {
1535 tok->done = E_LINECONT;
1536 tok->cur = tok->inp;
1537 return ERRORTOKEN;
1538 }
1539 tok->cont_line = 1;
1540 goto again; /* Read next line */
1541 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001542
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001543 /* Check for two-character token */
1544 {
1545 int c2 = tok_nextc(tok);
1546 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001547#ifndef PGEN
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001548 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1549 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1550 "<> not supported in 3.x; use !=",
1551 tok->filename, tok->lineno,
1552 NULL, NULL)) {
1553 return ERRORTOKEN;
1554 }
1555 }
Christian Heimes02c9ab52007-11-23 12:12:02 +00001556#endif
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001557 if (token != OP) {
1558 int c3 = tok_nextc(tok);
1559 int token3 = PyToken_ThreeChars(c, c2, c3);
1560 if (token3 != OP) {
1561 token = token3;
1562 } else {
1563 tok_backup(tok, c3);
1564 }
1565 *p_start = tok->start;
1566 *p_end = tok->cur;
1567 return token;
1568 }
1569 tok_backup(tok, c2);
1570 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001571
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001572 /* Keep track of parentheses nesting level */
1573 switch (c) {
1574 case '(':
1575 case '[':
1576 case '{':
1577 tok->level++;
1578 break;
1579 case ')':
1580 case ']':
1581 case '}':
1582 tok->level--;
1583 break;
1584 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001585
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001586 /* Punctuation character */
1587 *p_start = tok->start;
1588 *p_end = tok->cur;
1589 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001590}
1591
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001592int
1593PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1594{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001595 int result = tok_get(tok, p_start, p_end);
1596 if (tok->decoding_erred) {
1597 result = ERRORTOKEN;
1598 tok->done = E_DECODE;
1599 }
1600 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001601}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001602
Martin v. Löwisa5136192007-09-04 14:19:28 +00001603/* This function is only called from parsetok. However, it cannot live
1604 there, as it must be empty for PGEN, and we can check for PGEN only
1605 in this file. */
1606
Christian Heimes082c9b02008-01-23 14:20:50 +00001607#if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    /* Stub for builds with PGEN defined or without Py_USING_UNICODE:
       nothing is re-encoded, so all arguments are ignored. */
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1613#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001614#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001615static PyObject *
1616dec_utf8(const char *enc, const char *text, size_t len) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001617 PyObject *ret = NULL;
1618 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1619 if (unicode_text) {
1620 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1621 Py_DECREF(unicode_text);
1622 }
1623 if (!ret) {
1624 PyErr_Clear();
1625 }
1626 return ret;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001627}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001628char *
1629PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1630{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001631 char *text = NULL;
1632 if (tok->encoding) {
1633 /* convert source to original encondig */
1634 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1635 if (lineobj != NULL) {
1636 int linelen = PyString_Size(lineobj);
1637 const char *line = PyString_AsString(lineobj);
1638 text = PyObject_MALLOC(linelen + 1);
1639 if (text != NULL && line != NULL) {
1640 if (linelen)
1641 strncpy(text, line, linelen);
1642 text[linelen] = '\0';
1643 }
1644 Py_DECREF(lineobj);
1645
1646 /* adjust error offset */
1647 if (*offset > 1) {
1648 PyObject *offsetobj = dec_utf8(tok->encoding,
1649 tok->buf, *offset-1);
1650 if (offsetobj) {
1651 *offset = PyString_Size(offsetobj) + 1;
1652 Py_DECREF(offsetobj);
1653 }
1654 }
1655
1656 }
1657 }
1658 return text;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001659
1660}
Georg Brandl76b30d12008-01-07 18:41:34 +00001661#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001662#endif
1663
Martin v. Löwisa5136192007-09-04 14:19:28 +00001664
Guido van Rossum408027e1996-12-30 16:17:54 +00001665#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001666
1667void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001668tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001669{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001670 printf("%s", _PyParser_TokenNames[type]);
1671 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1672 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001673}
1674
1675#endif