blob: fbbd0bc7fb3288e2c4d128414fc11ac38d3d3cbb [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable names of the token types, in token-number order. */
char *_PyParser_TokenNames[] = {
    /* End marker, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
Benjamin Peterson4ceeeb02010-04-03 22:48:51 +000096/* Ensure that the locale does not interfere with tokenization. */
97
/* Return 1 if C is an ASCII letter ('a'..'z' or 'A'..'Z'), else 0.
   Unlike isalpha(), the answer never depends on the current locale. */
static int
ascii_isalpha(int c)
{
    if (c >= 'a' && c <= 'z')
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    return 0;
}
103
/* Return 1 if C is an ASCII letter or an ASCII decimal digit, else 0.
   Like ascii_isalpha(), this ignores the current locale. */
static int
ascii_isalnum(int c)
{
    if (ascii_isalpha(c))
        return 1;
    return c >= '0' && c <= '9';
}
109
110
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111/* Create and initialize a new tok_state structure */
112
113static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000114tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000115{
Anthony Baxter11490022006-04-11 05:39:14 +0000116 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
117 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000118 if (tok == NULL)
119 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000120 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000121 tok->done = E_OK;
122 tok->fp = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000123 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000124 tok->tabsize = TABSIZE;
125 tok->indent = 0;
126 tok->indstack[0] = 0;
127 tok->atbol = 1;
128 tok->pendin = 0;
129 tok->prompt = tok->nextprompt = NULL;
130 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000131 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000132 tok->filename = NULL;
133 tok->altwarning = 0;
134 tok->alterror = 0;
135 tok->alttabsize = 1;
136 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_state = 0;
138 tok->decoding_erred = 0;
139 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersone36199b2009-11-12 23:39:44 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
/* PGEN build: no decoding is performed; return a newly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Normalize the spelling of an encoding name (for utf-8 and latin-1).

   At most the first 12 characters of S are examined, with '_' treated as
   '-' and ASCII upper-case letters folded to lower case.  The folding is
   done by hand rather than with tolower() so that the result does not
   depend on the current locale and so that bytes with the high bit set
   are never passed to a <ctype.h> function.

   Returns the literal "utf-8" or "iso-8859-1" when S names one of those
   encodings (optionally followed by a "-suffix"); otherwise returns S
   itself, so callers can detect "no change" by pointer comparison. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        char c = s[i];
        if (c == '\0')
            break;
        if (c == '_')
            c = '-';
        else if ('A' <= c && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        buf[i] = c;
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Benjamin Peterson4ceeeb02010-04-03 22:48:51 +0000248 while (ascii_isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == 1); /* raw */
286 if (strcmp(cs, "utf-8") == 0 ||
287 strcmp(cs, "iso-8859-1") == 0) {
288 tok->encoding = cs;
289 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000290#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000291 r = set_readline(tok, cs);
292 if (r) {
293 tok->encoding = cs;
294 tok->decoding_state = -1;
295 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000296 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000297 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000298#else
299 /* Without Unicode support, we cannot
300 process the coding spec. Since there
301 won't be any Unicode literals, that
302 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000303 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000304#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305 }
306 } else { /* then, compare cs with BOM */
307 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000308 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309 }
310 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000311 if (!r) {
312 cs = tok->encoding;
313 if (!cs)
314 cs = "with BOM";
315 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
316 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000317 return r;
318}
319
320/* See whether the file starts with a BOM. If it does,
321 invoke the set_readline function with the new encoding.
322 Return 1 on success, 0 on failure. */
323
324static int
325check_bom(int get_char(struct tok_state *),
326 void unget_char(int, struct tok_state *),
327 int set_readline(struct tok_state *, const char *),
328 struct tok_state *tok)
329{
Victor Stinnerd23d3932010-03-02 23:20:02 +0000330 int ch1, ch2, ch3;
331 ch1 = get_char(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->decoding_state = 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000333 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000334 return 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000335 } else if (ch1 == 0xEF) {
336 ch2 = get_char(tok);
337 if (ch2 != 0xBB) {
338 unget_char(ch2, tok);
339 unget_char(ch1, tok);
340 return 1;
341 }
342 ch3 = get_char(tok);
343 if (ch3 != 0xBF) {
344 unget_char(ch3, tok);
345 unget_char(ch2, tok);
346 unget_char(ch1, tok);
347 return 1;
348 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349#if 0
350 /* Disable support for UTF-16 BOMs until a decision
351 is made whether this needs to be supported. */
Victor Stinnerd23d3932010-03-02 23:20:02 +0000352 } else if (ch1 == 0xFE) {
353 ch2 = get_char(tok);
354 if (ch2 != 0xFF) {
355 unget_char(ch2, tok);
356 unget_char(ch1, tok);
357 return 1;
358 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000359 if (!set_readline(tok, "utf-16-be"))
360 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361 tok->decoding_state = -1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000362 } else if (ch1 == 0xFF) {
363 ch2 = get_char(tok);
364 if (ch2 != 0xFE) {
365 unget_char(ch2, tok);
366 unget_char(ch1, tok);
367 return 1;
368 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000369 if (!set_readline(tok, "utf-16-le"))
370 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371 tok->decoding_state = -1;
372#endif
373 } else {
Victor Stinnerd23d3932010-03-02 23:20:02 +0000374 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000375 return 1;
376 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000377 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000378 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000379 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
380 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381}
382
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure or at end of input, else S.  At most SIZE-1
   bytes are stored, followed by a terminating NUL.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
           and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
           room (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the
           overflow.  In this case, fp_readl is called in a loop (with an
           expanded buffer) until the buffer ends with a '\n' (or until
           the end of the file is reached): see tok_nextc and its calls
           to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the stored object; an exact string is the already
           UTF-8 encoded remainder of an earlier line. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Line does not fit: keep the tail for the next call. */
        tok->decoding_buffer = PyString_FromStringAndSize(bytes + size,
                                                          nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
448
449/* Set the readline function for TOK to a StreamReader's
450 readline function. The StreamReader is named ENC.
451
452 This function is called from check_bom and check_coding_spec.
453
454 ENC is usually identical to the future value of tok->encoding,
455 except for the (currently unsupported) case of UTF-16.
456
457 Return 1 on success, 0 on failure. */
458
459static int
460fp_setreadl(struct tok_state *tok, const char* enc)
461{
462 PyObject *reader, *stream, *readline;
463
Martin v. Löwis95292d62002-12-11 14:04:59 +0000464 /* XXX: constify filename argument. */
465 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000466 if (stream == NULL)
467 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468
469 reader = PyCodec_StreamReader(enc, stream, NULL);
470 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000471 if (reader == NULL)
472 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000473
474 readline = PyObject_GetAttrString(reader, "readline");
475 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000476 if (readline == NULL)
477 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
479 tok->decoding_readline = readline;
480 return 1;
481}
482
483/* Fetch the next byte from TOK. */
484
485static int fp_getc(struct tok_state *tok) {
486 return getc(tok->fp);
487}
488
489/* Unfetch the last byte back into TOK. */
490
491static void fp_ungetc(int c, struct tok_state *tok) {
492 ungetc(c, tok->fp);
493}
494
495/* Read a line of input from TOK. Determine encoding
496 if necessary. */
497
498static char *
499decoding_fgets(char *s, int size, struct tok_state *tok)
500{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000501 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000502 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000503 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504 if (tok->decoding_state < 0) {
505 /* We already have a codec associated with
506 this input. */
507 line = fp_readl(s, size, tok);
508 break;
509 } else if (tok->decoding_state > 0) {
510 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000511 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000512 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000513 break;
514 } else {
515 /* We have not yet determined the encoding.
516 If an encoding is found, use the file-pointer
517 reader functions from now on. */
518 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
519 return error_ret(tok);
520 assert(tok->decoding_state != 0);
521 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000522 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
524 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
525 return error_ret(tok);
526 }
527 }
528#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000529 /* The default encoding is ASCII, so make sure we don't have any
530 non-ASCII bytes in it. */
531 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000533 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000534 if (*c > 127) {
535 badchar = *c;
536 break;
537 }
538 }
539 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000540 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000541 /* Need to add 1 to the line number, since this line
542 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000543 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000544 "Non-ASCII character '\\x%.2x' "
545 "in file %.200s on line %i, "
546 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000547 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000548 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000549 PyErr_SetString(PyExc_SyntaxError, buf);
550 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 }
552#endif
553 return line;
554}
555
556static int
557decoding_feof(struct tok_state *tok)
558{
559 if (tok->decoding_state >= 0) {
560 return feof(tok->fp);
561 } else {
562 PyObject* buf = tok->decoding_buffer;
563 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000564 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 if (buf == NULL) {
566 error_ret(tok);
567 return 1;
568 } else {
569 tok->decoding_buffer = buf;
570 }
571 }
572 return PyObject_Length(buf) == 0;
573 }
574}
575
576/* Fetch a byte from TOK, using the string buffer. */
577
Tim Petersc9d78aa2006-03-26 23:27:58 +0000578static int
579buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000580 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000581}
582
583/* Unfetch a byte from TOK, using the string buffer. */
584
Tim Petersc9d78aa2006-03-26 23:27:58 +0000585static void
586buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000588 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589}
590
591/* Set the readline function for TOK to ENC. For the string-based
592 tokenizer, this means to just record the encoding. */
593
Tim Petersc9d78aa2006-03-26 23:27:58 +0000594static int
595buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 tok->enc = enc;
597 return 1;
598}
599
600/* Return a UTF-8 encoding Python string object from the
601 C byte string STR, which is encoded with ENC. */
602
Martin v. Löwis019934b2002-08-07 12:33:18 +0000603#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604static PyObject *
605translate_into_utf8(const char* str, const char* enc) {
606 PyObject *utf8;
607 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
608 if (buf == NULL)
609 return NULL;
610 utf8 = PyUnicode_AsUTF8String(buf);
611 Py_DECREF(buf);
612 return utf8;
613}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000614#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615
Benjamin Petersone36199b2009-11-12 23:39:44 +0000616
617static char *
618translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000619 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000620 char *buf, *current;
Benjamin Peterson42d63842009-12-06 17:37:48 +0000621 char c = '\0';
622 buf = PyMem_MALLOC(needed_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000623 if (buf == NULL) {
624 tok->done = E_NOMEM;
625 return NULL;
626 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000627 for (current = buf; *s; s++, current++) {
628 c = *s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000629 if (skip_next_lf) {
630 skip_next_lf = 0;
631 if (c == '\n') {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000632 c = *++s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000633 if (!c)
634 break;
635 }
636 }
637 if (c == '\r') {
638 skip_next_lf = 1;
639 c = '\n';
640 }
641 *current = c;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000642 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000643 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersone36199b2009-11-12 23:39:44 +0000644 there isn't one already. */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000645 if (exec_input && c != '\n') {
Benjamin Petersone36199b2009-11-12 23:39:44 +0000646 *current = '\n';
647 current++;
648 }
649 *current = '\0';
Benjamin Peterson42d63842009-12-06 17:37:48 +0000650 final_length = current - buf + 1;
651 if (final_length < needed_length && final_length)
Benjamin Petersone36199b2009-11-12 23:39:44 +0000652 /* should never fail */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000653 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000654 return buf;
655}
656
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000657/* Decode a byte string STR for use as the buffer of TOK.
658 Look for encoding declarations inside STR, and record them
659 inside TOK. */
660
661static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000662decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663{
664 PyObject* utf8 = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000665 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000667 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668 int lineno = 0;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000669 tok->input = str = translate_newlines(input, single, tok);
670 if (str == NULL)
671 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000672 tok->enc = NULL;
673 tok->str = str;
674 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000675 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000677 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000678#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679 if (tok->enc != NULL) {
680 utf8 = translate_into_utf8(str, tok->enc);
681 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000682 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000683 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000685#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000686 for (s = str;; s++) {
687 if (*s == '\0') break;
688 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000689 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000690 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 lineno++;
692 if (lineno == 2) break;
693 }
694 }
695 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000696 /* need to check line 1 and 2 separately since check_coding_spec
697 assumes a single line as input */
698 if (newl[0]) {
699 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
700 return error_ret(tok);
701 if (tok->enc == NULL && newl[1]) {
702 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
703 tok, buf_setreadl))
704 return error_ret(tok);
705 }
706 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000707#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000708 if (tok->enc != NULL) {
709 assert(utf8 == NULL);
710 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000711 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000712 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000713 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000714 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000715#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000716 assert(tok->decoding_buffer == NULL);
717 tok->decoding_buffer = utf8; /* CAUTION */
718 return str;
719}
720
721#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722
723/* Set up tokenizer for string */
724
725struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000726PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727{
728 struct tok_state *tok = tok_new();
729 if (tok == NULL)
730 return NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000731 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000732 if (str == NULL) {
733 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000735 }
736
Martin v. Löwis95292d62002-12-11 14:04:59 +0000737 /* XXX: constify members. */
738 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 return tok;
740}
741
742
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000743/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744
745struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000746PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747{
748 struct tok_state *tok = tok_new();
749 if (tok == NULL)
750 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000751 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000752 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753 return NULL;
754 }
755 tok->cur = tok->inp = tok->buf;
756 tok->end = tok->buf + BUFSIZ;
757 tok->fp = fp;
758 tok->prompt = ps1;
759 tok->nextprompt = ps2;
760 return tok;
761}
762
763
764/* Free a tok_state structure */
765
766void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000767PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000769 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000770 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000771#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000772 Py_XDECREF(tok->decoding_readline);
773 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000774#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000776 PyMem_FREE(tok->buf);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000777 if (tok->input)
778 PyMem_FREE((char *)tok->input);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000779 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780}
781
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000782#if !defined(PGEN) && defined(Py_USING_UNICODE)
783static int
784tok_stdin_decode(struct tok_state *tok, char **inp)
785{
786 PyObject *enc, *sysstdin, *decoded, *utf8;
787 const char *encoding;
788 char *converted;
789
790 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
791 return 0;
792 sysstdin = PySys_GetObject("stdin");
793 if (sysstdin == NULL || !PyFile_Check(sysstdin))
794 return 0;
795
796 enc = ((PyFileObject *)sysstdin)->f_encoding;
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000797 if (enc == NULL || !PyString_Check(enc))
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000798 return 0;
799 Py_INCREF(enc);
800
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000801 encoding = PyString_AsString(enc);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000802 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
803 if (decoded == NULL)
804 goto error_clear;
805
806 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
807 Py_DECREF(decoded);
808 if (utf8 == NULL)
809 goto error_clear;
810
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000811 assert(PyString_Check(utf8));
812 converted = new_string(PyString_AS_STRING(utf8),
813 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000814 Py_DECREF(utf8);
815 if (converted == NULL)
816 goto error_nomem;
817
Neal Norwitz08062d62006-04-11 08:19:15 +0000818 PyMem_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000819 *inp = converted;
820 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000821 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000822 tok->encoding = new_string(encoding, strlen(encoding));
823 if (tok->encoding == NULL)
824 goto error_nomem;
825
826 Py_DECREF(enc);
827 return 0;
828
829error_nomem:
830 Py_DECREF(enc);
831 tok->done = E_NOMEM;
832 return -1;
833
834error_clear:
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000835 Py_DECREF(enc);
Victor Stinner66644262010-03-10 22:30:19 +0000836 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
837 tok->done = E_ERROR;
838 return -1;
839 }
840 /* Fallback to iso-8859-1: for backward compatibility */
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000841 PyErr_Clear();
842 return 0;
843}
844#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845
846/* Get next char, updating state; error code goes into tok->done */
847
848static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000849tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000850{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000851 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000852 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000853 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000854 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000855 if (tok->done != E_OK)
856 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000858 char *end = strchr(tok->inp, '\n');
859 if (end != NULL)
860 end++;
861 else {
862 end = strchr(tok->inp, '\0');
863 if (end == tok->inp) {
864 tok->done = E_EOF;
865 return EOF;
866 }
867 }
868 if (tok->start == NULL)
869 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000870 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000871 tok->lineno++;
872 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000873 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000874 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000876 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000877 if (tok->nextprompt != NULL)
878 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000879 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000881 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000882 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883 tok->done = E_EOF;
884 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000885#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000886 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000887 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000888#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000890 size_t start = tok->start - tok->buf;
891 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000892 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000893 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000894 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000895 tok->lineno++;
896 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000897 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000898 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000899 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 tok->done = E_NOMEM;
901 return EOF;
902 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000903 tok->buf = buf;
904 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000905 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000906 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000907 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000908 tok->inp = tok->buf + newlen;
909 tok->end = tok->inp + 1;
910 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000911 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000912 else {
913 tok->lineno++;
914 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000915 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000916 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000917 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000918 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000919 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000920 tok->inp = strchr(tok->buf, '\0');
921 tok->end = tok->inp + 1;
922 }
923 }
924 else {
925 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000926 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000927 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000928 if (tok->start == NULL) {
929 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000930 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000931 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000932 if (tok->buf == NULL) {
933 tok->done = E_NOMEM;
934 return EOF;
935 }
936 tok->end = tok->buf + BUFSIZ;
937 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000938 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
939 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000940 tok->done = E_EOF;
941 done = 1;
942 }
943 else {
944 tok->done = E_OK;
945 tok->inp = strchr(tok->buf, '\0');
946 done = tok->inp[-1] == '\n';
947 }
948 }
949 else {
950 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000951 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000952 tok->done = E_EOF;
953 done = 1;
954 }
955 else
956 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000957 }
958 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000959 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000960 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000961 Py_ssize_t curstart = tok->start == NULL ? -1 :
962 tok->start - tok->buf;
963 Py_ssize_t curvalid = tok->inp - tok->buf;
964 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000965 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000966 newbuf = (char *)PyMem_REALLOC(newbuf,
967 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000968 if (newbuf == NULL) {
969 tok->done = E_NOMEM;
970 tok->cur = tok->inp;
971 return EOF;
972 }
973 tok->buf = newbuf;
974 tok->inp = tok->buf + curvalid;
975 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000976 tok->start = curstart < 0 ? NULL :
977 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000978 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000979 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000980 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000981 /* Break out early on decoding
982 errors, as tok->buf will be NULL
983 */
984 if (tok->decoding_erred)
985 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000986 /* Last line does not end in \n,
987 fake one */
988 strcpy(tok->inp, "\n");
989 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000990 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000991 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000992 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000993 if (tok->buf != NULL) {
994 tok->cur = tok->buf + cur;
995 tok->line_start = tok->cur;
996 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000997 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000998 pt = tok->inp - 2;
999 if (pt >= tok->buf && *pt == '\r') {
1000 *pt++ = '\n';
1001 *pt = '\0';
1002 tok->inp = pt;
1003 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001004 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001005 }
1006 if (tok->done != E_OK) {
1007 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001008 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001009 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001010 return EOF;
1011 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001012 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001013 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001014}
1015
1016
1017/* Back-up one character */
1018
1019static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001020tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001021{
1022 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001023 if (--tok->cur < tok->buf)
Benjamin Petersone3383b82009-11-07 01:04:38 +00001024 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025 if (*tok->cur != c)
1026 *tok->cur = c;
1027 }
1028}
1029
1030
1031/* Return the token corresponding to a single character */
1032
1033int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001034PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001035{
1036 switch (c) {
1037 case '(': return LPAR;
1038 case ')': return RPAR;
1039 case '[': return LSQB;
1040 case ']': return RSQB;
1041 case ':': return COLON;
1042 case ',': return COMMA;
1043 case ';': return SEMI;
1044 case '+': return PLUS;
1045 case '-': return MINUS;
1046 case '*': return STAR;
1047 case '/': return SLASH;
1048 case '|': return VBAR;
1049 case '&': return AMPER;
1050 case '<': return LESS;
1051 case '>': return GREATER;
1052 case '=': return EQUAL;
1053 case '.': return DOT;
1054 case '%': return PERCENT;
1055 case '`': return BACKQUOTE;
1056 case '{': return LBRACE;
1057 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001058 case '^': return CIRCUMFLEX;
1059 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001060 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061 default: return OP;
1062 }
1063}
1064
1065
Guido van Rossumfbab9051991-10-20 20:25:03 +00001066int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001067PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001068{
1069 switch (c1) {
1070 case '=':
1071 switch (c2) {
1072 case '=': return EQEQUAL;
1073 }
1074 break;
1075 case '!':
1076 switch (c2) {
1077 case '=': return NOTEQUAL;
1078 }
1079 break;
1080 case '<':
1081 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001082 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001083 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001084 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001085 }
1086 break;
1087 case '>':
1088 switch (c2) {
1089 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001090 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001091 }
1092 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001093 case '+':
1094 switch (c2) {
1095 case '=': return PLUSEQUAL;
1096 }
1097 break;
1098 case '-':
1099 switch (c2) {
1100 case '=': return MINEQUAL;
1101 }
1102 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001103 case '*':
1104 switch (c2) {
1105 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001106 case '=': return STAREQUAL;
1107 }
1108 break;
1109 case '/':
1110 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001111 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001112 case '=': return SLASHEQUAL;
1113 }
1114 break;
1115 case '|':
1116 switch (c2) {
1117 case '=': return VBAREQUAL;
1118 }
1119 break;
1120 case '%':
1121 switch (c2) {
1122 case '=': return PERCENTEQUAL;
1123 }
1124 break;
1125 case '&':
1126 switch (c2) {
1127 case '=': return AMPEREQUAL;
1128 }
1129 break;
1130 case '^':
1131 switch (c2) {
1132 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001133 }
1134 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001135 }
1136 return OP;
1137}
1138
Thomas Wouters434d0822000-08-24 20:11:32 +00001139int
1140PyToken_ThreeChars(int c1, int c2, int c3)
1141{
1142 switch (c1) {
1143 case '<':
1144 switch (c2) {
1145 case '<':
1146 switch (c3) {
1147 case '=':
1148 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001149 }
1150 break;
1151 }
1152 break;
1153 case '>':
1154 switch (c2) {
1155 case '>':
1156 switch (c3) {
1157 case '=':
1158 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001159 }
1160 break;
1161 }
1162 break;
1163 case '*':
1164 switch (c2) {
1165 case '*':
1166 switch (c3) {
1167 case '=':
1168 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001169 }
1170 break;
1171 }
1172 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001173 case '/':
1174 switch (c2) {
1175 case '/':
1176 switch (c3) {
1177 case '=':
1178 return DOUBLESLASHEQUAL;
1179 }
1180 break;
1181 }
1182 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001183 }
1184 return OP;
1185}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001186
Guido van Rossum926f13a1998-04-09 21:38:06 +00001187static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001188indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001189{
1190 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001191 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001192 tok->cur = tok->inp;
1193 return 1;
1194 }
1195 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001196 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1197 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001198 tok->altwarning = 0;
1199 }
1200 return 0;
1201}
1202
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203/* Get next token, after space stripping etc. */
1204
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001205static int
1206tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207{
1208 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001209 int blankline;
1210
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001211 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001212 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001213 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001214 blankline = 0;
1215
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216 /* Get indentation level */
1217 if (tok->atbol) {
1218 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001219 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 for (;;) {
1222 c = tok_nextc(tok);
1223 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001224 col++, altcol++;
1225 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001226 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001227 altcol = (altcol/tok->alttabsize + 1)
1228 * tok->alttabsize;
1229 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001230 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001231 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 else
1233 break;
1234 }
1235 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001236 if (c == '#' || c == '\n') {
1237 /* Lines with only whitespace and/or comments
1238 shouldn't affect the indentation and are
1239 not passed to the parser as NEWLINE tokens,
1240 except *totally* empty lines in interactive
1241 mode, which signal the end of a command group. */
1242 if (col == 0 && c == '\n' && tok->prompt != NULL)
1243 blankline = 0; /* Let it through */
1244 else
1245 blankline = 1; /* Ignore completely */
1246 /* We can't jump back right here since we still
1247 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001249 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001250 if (col == tok->indstack[tok->indent]) {
1251 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001252 if (altcol != tok->altindstack[tok->indent]) {
1253 if (indenterror(tok))
1254 return ERRORTOKEN;
1255 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001257 else if (col > tok->indstack[tok->indent]) {
1258 /* Indent -- always one */
1259 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001260 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001261 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001262 return ERRORTOKEN;
1263 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001264 if (altcol <= tok->altindstack[tok->indent]) {
1265 if (indenterror(tok))
1266 return ERRORTOKEN;
1267 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001268 tok->pendin++;
1269 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001272 else /* col < tok->indstack[tok->indent] */ {
1273 /* Dedent -- any number, must be consistent */
1274 while (tok->indent > 0 &&
1275 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001276 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001277 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001278 }
1279 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001280 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001281 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001282 return ERRORTOKEN;
1283 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001284 if (altcol != tok->altindstack[tok->indent]) {
1285 if (indenterror(tok))
1286 return ERRORTOKEN;
1287 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 }
1289 }
1290 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001291
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001292 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001293
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 /* Return pending indents/dedents */
1295 if (tok->pendin != 0) {
1296 if (tok->pendin < 0) {
1297 tok->pendin++;
1298 return DEDENT;
1299 }
1300 else {
1301 tok->pendin--;
1302 return INDENT;
1303 }
1304 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001305
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001307 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 /* Skip spaces */
1309 do {
1310 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001311 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001312
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001314 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001315
Guido van Rossumab5ca152000-03-31 00:52:27 +00001316 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001318 static char *tabforms[] = {
1319 "tab-width:", /* Emacs */
1320 ":tabstop=", /* vim, full form */
1321 ":ts=", /* vim, abbreviated form */
1322 "set tabsize=", /* will vi never die? */
1323 /* more templates can be added here to support other editors */
1324 };
1325 char cbuf[80];
1326 char *tp, **cp;
1327 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001329 *tp++ = c = tok_nextc(tok);
1330 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001331 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001332 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001333 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001334 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1335 cp++) {
1336 if ((tp = strstr(cbuf, *cp))) {
1337 int newsize = atoi(tp + strlen(*cp));
1338
1339 if (newsize >= 1 && newsize <= 40) {
1340 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001341 if (Py_VerboseFlag)
1342 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001343 "Tab size set to %d\n",
1344 newsize);
1345 }
1346 }
1347 }
1348 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001351
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001353 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001355 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001356
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 /* Identifier (most frequent token!) */
Benjamin Peterson4ceeeb02010-04-03 22:48:51 +00001358 if (ascii_isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001359 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001360 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001361 case 'b':
1362 case 'B':
1363 c = tok_nextc(tok);
1364 if (c == 'r' || c == 'R')
1365 c = tok_nextc(tok);
1366 if (c == '"' || c == '\'')
1367 goto letter_quote;
1368 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001369 case 'r':
1370 case 'R':
1371 c = tok_nextc(tok);
1372 if (c == '"' || c == '\'')
1373 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001374 break;
1375 case 'u':
1376 case 'U':
1377 c = tok_nextc(tok);
1378 if (c == 'r' || c == 'R')
1379 c = tok_nextc(tok);
1380 if (c == '"' || c == '\'')
1381 goto letter_quote;
1382 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001383 }
Benjamin Peterson4ceeeb02010-04-03 22:48:51 +00001384 while (ascii_isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001386 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001388 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389 *p_end = tok->cur;
1390 return NAME;
1391 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001392
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001393 /* Newline */
1394 if (c == '\n') {
1395 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001396 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001397 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001400 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 return NEWLINE;
1402 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001403
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001404 /* Period or number starting with period? */
1405 if (c == '.') {
1406 c = tok_nextc(tok);
1407 if (isdigit(c)) {
1408 goto fraction;
1409 }
1410 else {
1411 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001412 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001413 *p_end = tok->cur;
1414 return DOT;
1415 }
1416 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001417
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 /* Number */
1419 if (isdigit(c)) {
1420 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001421 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 c = tok_nextc(tok);
1423 if (c == '.')
1424 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001425#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001426 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001427 goto imaginary;
1428#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001429 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001430
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001431 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001432 c = tok_nextc(tok);
1433 if (!isxdigit(c)) {
1434 tok->done = E_TOKEN;
1435 tok_backup(tok, c);
1436 return ERRORTOKEN;
1437 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 do {
1439 c = tok_nextc(tok);
1440 } while (isxdigit(c));
1441 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001442 else if (c == 'o' || c == 'O') {
1443 /* Octal */
1444 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001445 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001446 tok->done = E_TOKEN;
1447 tok_backup(tok, c);
1448 return ERRORTOKEN;
1449 }
1450 do {
1451 c = tok_nextc(tok);
1452 } while ('0' <= c && c < '8');
1453 }
1454 else if (c == 'b' || c == 'B') {
1455 /* Binary */
1456 c = tok_nextc(tok);
1457 if (c != '0' && c != '1') {
1458 tok->done = E_TOKEN;
1459 tok_backup(tok, c);
1460 return ERRORTOKEN;
1461 }
1462 do {
1463 c = tok_nextc(tok);
1464 } while (c == '0' || c == '1');
1465 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001467 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001468 /* Octal; c is first char of it */
1469 /* There's no 'isoctdigit' macro, sigh */
1470 while ('0' <= c && c < '8') {
1471 c = tok_nextc(tok);
1472 }
Tim Petersd507dab2001-08-30 20:51:59 +00001473 if (isdigit(c)) {
1474 found_decimal = 1;
1475 do {
1476 c = tok_nextc(tok);
1477 } while (isdigit(c));
1478 }
1479 if (c == '.')
1480 goto fraction;
1481 else if (c == 'e' || c == 'E')
1482 goto exponent;
1483#ifndef WITHOUT_COMPLEX
1484 else if (c == 'j' || c == 'J')
1485 goto imaginary;
1486#endif
1487 else if (found_decimal) {
1488 tok->done = E_TOKEN;
1489 tok_backup(tok, c);
1490 return ERRORTOKEN;
1491 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001492 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001493 if (c == 'l' || c == 'L')
1494 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001495 }
1496 else {
1497 /* Decimal */
1498 do {
1499 c = tok_nextc(tok);
1500 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001501 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001502 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001503 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001504 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001505 if (c == '.') {
1506 fraction:
1507 /* Fraction */
1508 do {
1509 c = tok_nextc(tok);
1510 } while (isdigit(c));
1511 }
1512 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001513 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001514 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001515 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001516 if (c == '+' || c == '-')
1517 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001518 if (!isdigit(c)) {
1519 tok->done = E_TOKEN;
1520 tok_backup(tok, c);
1521 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001522 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001523 do {
1524 c = tok_nextc(tok);
1525 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001526 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001527#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001528 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001529 /* Imaginary part */
1530 imaginary:
1531 c = tok_nextc(tok);
1532#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001533 }
1534 }
1535 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001536 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001537 *p_end = tok->cur;
1538 return NUMBER;
1539 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001540
1541 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001542 /* String */
1543 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001544 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001545 int quote = c;
1546 int triple = 0;
1547 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001548 for (;;) {
1549 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001550 if (c == '\n') {
1551 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001552 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001553 tok_backup(tok, c);
1554 return ERRORTOKEN;
1555 }
1556 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001557 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001558 }
1559 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001560 if (triple)
1561 tok->done = E_EOFS;
1562 else
1563 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001564 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565 return ERRORTOKEN;
1566 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001567 else if (c == quote) {
1568 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001569 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001570 c = tok_nextc(tok);
1571 if (c == quote) {
1572 triple = 1;
1573 tripcount = 0;
1574 continue;
1575 }
1576 tok_backup(tok, c);
1577 }
1578 if (!triple || tripcount == 3)
1579 break;
1580 }
1581 else if (c == '\\') {
1582 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001583 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001584 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001585 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001586 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001587 return ERRORTOKEN;
1588 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001589 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001590 else
1591 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001592 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001593 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001594 *p_end = tok->cur;
1595 return STRING;
1596 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001597
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001598 /* Line continuation */
1599 if (c == '\\') {
1600 c = tok_nextc(tok);
1601 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001602 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001603 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001604 return ERRORTOKEN;
1605 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001606 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001607 goto again; /* Read next line */
1608 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001609
Guido van Rossumfbab9051991-10-20 20:25:03 +00001610 /* Check for two-character token */
1611 {
1612 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001613 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001614#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001615 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001616 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001617 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001618 tok->filename, tok->lineno,
1619 NULL, NULL)) {
1620 return ERRORTOKEN;
1621 }
1622 }
1623#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001624 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001625 int c3 = tok_nextc(tok);
1626 int token3 = PyToken_ThreeChars(c, c2, c3);
1627 if (token3 != OP) {
1628 token = token3;
1629 } else {
1630 tok_backup(tok, c3);
1631 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001632 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001633 *p_end = tok->cur;
1634 return token;
1635 }
1636 tok_backup(tok, c2);
1637 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001638
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001639 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001640 switch (c) {
1641 case '(':
1642 case '[':
1643 case '{':
1644 tok->level++;
1645 break;
1646 case ')':
1647 case ']':
1648 case '}':
1649 tok->level--;
1650 break;
1651 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001652
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001653 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001654 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001656 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001657}
1658
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001659int
1660PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1661{
1662 int result = tok_get(tok, p_start, p_end);
1663 if (tok->decoding_erred) {
1664 result = ERRORTOKEN;
1665 tok->done = E_DECODE;
1666 }
1667 return result;
1668}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001669
Martin v. Löwisa5136192007-09-04 14:19:28 +00001670/* This function is only called from parsetok. However, it cannot live
1671 there, as it must be empty for PGEN, and we can check for PGEN only
1672 in this file. */
1673
Christian Heimes082c9b02008-01-23 14:20:50 +00001674#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building PGEN or when unicode support is disabled:
   nothing is re-encoded, the arguments are ignored and *offset is left
   untouched.  Always returns NULL. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1680#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001681#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001682static PyObject *
1683dec_utf8(const char *enc, const char *text, size_t len) {
1684 PyObject *ret = NULL;
1685 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1686 if (unicode_text) {
1687 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1688 Py_DECREF(unicode_text);
1689 }
1690 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001691 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001692 }
1693 return ret;
1694}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001695char *
1696PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1697{
1698 char *text = NULL;
1699 if (tok->encoding) {
1700 /* convert source to original encondig */
1701 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1702 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001703 int linelen = PyString_Size(lineobj);
1704 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001705 text = PyObject_MALLOC(linelen + 1);
1706 if (text != NULL && line != NULL) {
1707 if (linelen)
1708 strncpy(text, line, linelen);
1709 text[linelen] = '\0';
1710 }
1711 Py_DECREF(lineobj);
1712
1713 /* adjust error offset */
1714 if (*offset > 1) {
1715 PyObject *offsetobj = dec_utf8(tok->encoding,
1716 tok->buf, *offset-1);
1717 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001718 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001719 Py_DECREF(offsetobj);
1720 }
1721 }
1722
1723 }
1724 }
1725 return text;
1726
1727}
Georg Brandl76b30d12008-01-07 18:41:34 +00001728#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001729#endif
1730
Martin v. Löwisa5136192007-09-04 14:19:28 +00001731
Guido van Rossum408027e1996-12-30 16:17:54 +00001732#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001733
1734void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001735tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001736{
Guido van Rossum86bea461997-04-29 21:03:06 +00001737 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001738 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1739 printf("(%.*s)", (int)(end - start), start);
1740}
1741
1742#endif