blob: b84a15437e48c30e1fda636d727bf5f760356410 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names */

/* Printable name of every token type, in token-number order.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* Structure and literals */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorators */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* Generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
138 return fgets(s, size, tok->fp);
139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
144 return feof(tok->fp);
145}
146
/* PGEN build: no decoding is done; STR is handed back unchanged and TOK
   is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000160 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined: they are lower-cased and
   every '_' is replaced by '-' before comparison.  Returns the string
   literal "utf-8" or "iso-8859-1" when S names one of those encodings
   (optionally followed by a "-suffix"); otherwise returns S itself, so the
   caller can detect "no change" by pointer comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
201static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000202get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000204 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
212 }
213 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
223
224 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000225 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000226 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000227 t++;
228
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000233 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000234 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 }
236 return r;
237 }
238 }
239 }
240 return NULL;
241}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
248static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000249check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 int set_readline(struct tok_state *, const char *))
251{
Tim Peters17db21f2002-09-03 15:39:58 +0000252 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000254
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000258 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000267#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
272 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000273 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000274 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
Victor Stinner0217c952010-03-21 13:09:24 +0000307 int ch1, ch2, ch3;
308 ch1 = get_char(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309 tok->decoding_state = 1;
Victor Stinner0217c952010-03-21 13:09:24 +0000310 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000311 return 1;
Victor Stinner0217c952010-03-21 13:09:24 +0000312 } else if (ch1 == 0xEF) {
313 ch2 = get_char(tok);
314 if (ch2 != 0xBB) {
315 unget_char(ch2, tok);
316 unget_char(ch1, tok);
317 return 1;
318 }
319 ch3 = get_char(tok);
320 if (ch3 != 0xBF) {
321 unget_char(ch3, tok);
322 unget_char(ch2, tok);
323 unget_char(ch1, tok);
324 return 1;
325 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326#if 0
327 /* Disable support for UTF-16 BOMs until a decision
328 is made whether this needs to be supported. */
Victor Stinner0217c952010-03-21 13:09:24 +0000329 } else if (ch1 == 0xFE) {
330 ch2 = get_char(tok);
331 if (ch2 != 0xFF) {
332 unget_char(ch2, tok);
333 unget_char(ch1, tok);
334 return 1;
335 }
336 if (!set_readline(tok, "utf-16-be"))
337 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338 tok->decoding_state = -1;
Victor Stinner0217c952010-03-21 13:09:24 +0000339 } else if (ch1 == 0xFF) {
340 ch2 = get_char(tok);
341 if (ch2 != 0xFE) {
342 unget_char(ch2, tok);
343 unget_char(ch1, tok);
344 return 1;
345 }
346 if (!set_readline(tok, "utf-16-le"))
347 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 tok->decoding_state = -1;
349#endif
350 } else {
Victor Stinner0217c952010-03-21 13:09:24 +0000351 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352 return 1;
353 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000354 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000355 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
357 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000358}
359
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are copied and S is always NUL-terminated.  NULL is
   also returned when the line is empty (EOF).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take the stored object over from TOK.  An exact string is
           leftover UTF-8 from an earlier call and needs no re-encoding. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }

    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return (nbytes == 0) ? NULL : s;    /* NULL means EOF */
#endif
}
424
425/* Set the readline function for TOK to a StreamReader's
426 readline function. The StreamReader is named ENC.
427
428 This function is called from check_bom and check_coding_spec.
429
430 ENC is usually identical to the future value of tok->encoding,
431 except for the (currently unsupported) case of UTF-16.
432
433 Return 1 on success, 0 on failure. */
434
435static int
436fp_setreadl(struct tok_state *tok, const char* enc)
437{
438 PyObject *reader, *stream, *readline;
439
Martin v. Löwis95292d62002-12-11 14:04:59 +0000440 /* XXX: constify filename argument. */
441 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000442 if (stream == NULL)
443 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
445 reader = PyCodec_StreamReader(enc, stream, NULL);
446 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000447 if (reader == NULL)
448 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
450 readline = PyObject_GetAttrString(reader, "readline");
451 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000452 if (readline == NULL)
453 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
455 tok->decoding_readline = readline;
456 return 1;
457}
458
459/* Fetch the next byte from TOK. */
460
461static int fp_getc(struct tok_state *tok) {
462 return getc(tok->fp);
463}
464
465/* Unfetch the last byte back into TOK. */
466
467static void fp_ungetc(int c, struct tok_state *tok) {
468 ungetc(c, tok->fp);
469}
470
471/* Read a line of input from TOK. Determine encoding
472 if necessary. */
473
474static char *
475decoding_fgets(char *s, int size, struct tok_state *tok)
476{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000477 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000478 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000479 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480 if (tok->decoding_state < 0) {
481 /* We already have a codec associated with
482 this input. */
483 line = fp_readl(s, size, tok);
484 break;
485 } else if (tok->decoding_state > 0) {
486 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000487 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000489 break;
490 } else {
491 /* We have not yet determined the encoding.
492 If an encoding is found, use the file-pointer
493 reader functions from now on. */
494 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
495 return error_ret(tok);
496 assert(tok->decoding_state != 0);
497 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000498 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
500 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
501 return error_ret(tok);
502 }
503 }
504#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000505 /* The default encoding is ASCII, so make sure we don't have any
506 non-ASCII bytes in it. */
507 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000508 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000509 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510 if (*c > 127) {
511 badchar = *c;
512 break;
513 }
514 }
515 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000516 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000517 /* Need to add 1 to the line number, since this line
518 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000519 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000520 "Non-ASCII character '\\x%.2x' "
521 "in file %.200s on line %i, "
522 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000523 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000524 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000525 PyErr_SetString(PyExc_SyntaxError, buf);
526 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 }
528#endif
529 return line;
530}
531
532static int
533decoding_feof(struct tok_state *tok)
534{
535 if (tok->decoding_state >= 0) {
536 return feof(tok->fp);
537 } else {
538 PyObject* buf = tok->decoding_buffer;
539 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000540 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541 if (buf == NULL) {
542 error_ret(tok);
543 return 1;
544 } else {
545 tok->decoding_buffer = buf;
546 }
547 }
548 return PyObject_Length(buf) == 0;
549 }
550}
551
552/* Fetch a byte from TOK, using the string buffer. */
553
Tim Petersc9d78aa2006-03-26 23:27:58 +0000554static int
555buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000556 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557}
558
559/* Unfetch a byte from TOK, using the string buffer. */
560
Tim Petersc9d78aa2006-03-26 23:27:58 +0000561static void
562buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000564 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565}
566
567/* Set the readline function for TOK to ENC. For the string-based
568 tokenizer, this means to just record the encoding. */
569
Tim Petersc9d78aa2006-03-26 23:27:58 +0000570static int
571buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572 tok->enc = enc;
573 return 1;
574}
575
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *unicode;
    PyObject *encoded;

    unicode = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (unicode == NULL)
        return NULL;

    /* The intermediate unicode object is not needed once it has been
       re-encoded, whether or not that succeeded. */
    encoded = PyUnicode_AsUTF8String(unicode);
    Py_DECREF(unicode);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591
592/* Decode a byte string STR for use as the buffer of TOK.
593 Look for encoding declarations inside STR, and record them
594 inside TOK. */
595
596static const char *
597decode_str(const char *str, struct tok_state *tok)
598{
599 PyObject* utf8 = NULL;
600 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000601 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 int lineno = 0;
603 tok->enc = NULL;
604 tok->str = str;
605 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000606 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000608 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000609#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000610 if (tok->enc != NULL) {
611 utf8 = translate_into_utf8(str, tok->enc);
612 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000613 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000614 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000616#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000617 for (s = str;; s++) {
618 if (*s == '\0') break;
619 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000620 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000621 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 lineno++;
623 if (lineno == 2) break;
624 }
625 }
626 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000627 /* need to check line 1 and 2 separately since check_coding_spec
628 assumes a single line as input */
629 if (newl[0]) {
630 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
631 return error_ret(tok);
632 if (tok->enc == NULL && newl[1]) {
633 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
634 tok, buf_setreadl))
635 return error_ret(tok);
636 }
637 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000638#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 if (tok->enc != NULL) {
640 assert(utf8 == NULL);
641 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000642 if (utf8 == NULL) {
643 PyErr_Format(PyExc_SyntaxError,
644 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000645 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000646 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000647 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000649#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 assert(tok->decoding_buffer == NULL);
651 tok->decoding_buffer = utf8; /* CAUTION */
652 return str;
653}
654
655#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656
657/* Set up tokenizer for string */
658
659struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000660PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661{
662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 if (str == NULL) {
667 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000669 }
670
Martin v. Löwis95292d62002-12-11 14:04:59 +0000671 /* XXX: constify members. */
672 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 return tok;
674}
675
676
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000677/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678
679struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000680PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681{
682 struct tok_state *tok = tok_new();
683 if (tok == NULL)
684 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000685 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000686 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687 return NULL;
688 }
689 tok->cur = tok->inp = tok->buf;
690 tok->end = tok->buf + BUFSIZ;
691 tok->fp = fp;
692 tok->prompt = ps1;
693 tok->nextprompt = ps2;
694 return tok;
695}
696
697
698/* Free a tok_state structure */
699
700void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000701PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000704 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000705#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000706 Py_XDECREF(tok->decoding_readline);
707 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000708#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000709 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000710 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000711 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712}
713
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the line *INP from sys.stdin's encoding to UTF-8.

   Nothing is done (and 0 is returned) unless sys.stdin is a file object
   wrapping the C stdin and carrying a string f_encoding.  On success *INP
   is freed and replaced by the recoded copy, and tok->encoding is set to a
   copy of the encoding name.

   Returns 0 on success or when the line is left as is (including after a
   UnicodeDecodeError, which is cleared); returns -1 with tok->done set to
   E_NOMEM or E_ERROR otherwise. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode, *encoded;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold a reference for as long as enc_name is in use. */
    Py_INCREF(enc_obj);

    enc_name = PyString_AsString(enc_obj);
    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode == NULL)
        goto error_clear;

    encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (encoded == NULL)
        goto error_clear;

    assert(PyString_Check(encoded));
    recoded = new_string(PyString_AS_STRING(encoded),
                         PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (recoded == NULL)
        goto error_nomem;

    /* Swap the caller's line for the recoded one. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc_obj);
    return 0;

error_nomem:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc_obj);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777
778/* Get next char, updating state; error code goes into tok->done */
779
780static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000781tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000782{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000784 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000785 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000786 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000787 if (tok->done != E_OK)
788 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000790 char *end = strchr(tok->inp, '\n');
791 if (end != NULL)
792 end++;
793 else {
794 end = strchr(tok->inp, '\0');
795 if (end == tok->inp) {
796 tok->done = E_EOF;
797 return EOF;
798 }
799 }
800 if (tok->start == NULL)
801 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000802 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000803 tok->lineno++;
804 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000805 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000808 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809 if (tok->nextprompt != NULL)
810 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000811 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000812 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000813 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000814 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000815 tok->done = E_EOF;
816 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000817#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000818 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000819 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000820#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000822 size_t start = tok->start - tok->buf;
823 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000824 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000825 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000826 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 tok->lineno++;
828 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000829 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000830 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000831 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000832 tok->done = E_NOMEM;
833 return EOF;
834 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000835 tok->buf = buf;
836 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000837 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000838 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000839 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 tok->inp = tok->buf + newlen;
841 tok->end = tok->inp + 1;
842 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000843 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 else {
845 tok->lineno++;
846 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000847 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000848 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000849 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000850 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000851 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000852 tok->inp = strchr(tok->buf, '\0');
853 tok->end = tok->inp + 1;
854 }
855 }
856 else {
857 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000859 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 if (tok->start == NULL) {
861 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000862 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000863 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 if (tok->buf == NULL) {
865 tok->done = E_NOMEM;
866 return EOF;
867 }
868 tok->end = tok->buf + BUFSIZ;
869 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000870 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
871 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000872 tok->done = E_EOF;
873 done = 1;
874 }
875 else {
876 tok->done = E_OK;
877 tok->inp = strchr(tok->buf, '\0');
878 done = tok->inp[-1] == '\n';
879 }
880 }
881 else {
882 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000883 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000884 tok->done = E_EOF;
885 done = 1;
886 }
887 else
888 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 }
890 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000891 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000893 Py_ssize_t curstart = tok->start == NULL ? -1 :
894 tok->start - tok->buf;
895 Py_ssize_t curvalid = tok->inp - tok->buf;
896 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000897 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000898 newbuf = (char *)PyMem_REALLOC(newbuf,
899 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 if (newbuf == NULL) {
901 tok->done = E_NOMEM;
902 tok->cur = tok->inp;
903 return EOF;
904 }
905 tok->buf = newbuf;
906 tok->inp = tok->buf + curvalid;
907 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000908 tok->start = curstart < 0 ? NULL :
909 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000910 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000911 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000912 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000913 /* Break out early on decoding
914 errors, as tok->buf will be NULL
915 */
916 if (tok->decoding_erred)
917 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000918 /* Last line does not end in \n,
919 fake one */
920 strcpy(tok->inp, "\n");
921 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000922 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000923 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000924 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000925 if (tok->buf != NULL) {
926 tok->cur = tok->buf + cur;
927 tok->line_start = tok->cur;
928 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000929 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000930 pt = tok->inp - 2;
931 if (pt >= tok->buf && *pt == '\r') {
932 *pt++ = '\n';
933 *pt = '\0';
934 tok->inp = pt;
935 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000936 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000937 }
938 if (tok->done != E_OK) {
939 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000940 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942 return EOF;
943 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000944 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000945 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946}
947
948
949/* Back-up one character */
950
951static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000952tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953{
954 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000955 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000956 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957 if (*tok->cur != c)
958 *tok->cur = c;
959 }
960}
961
962
963/* Return the token corresponding to a single character */
964
965int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000966PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000967{
968 switch (c) {
969 case '(': return LPAR;
970 case ')': return RPAR;
971 case '[': return LSQB;
972 case ']': return RSQB;
973 case ':': return COLON;
974 case ',': return COMMA;
975 case ';': return SEMI;
976 case '+': return PLUS;
977 case '-': return MINUS;
978 case '*': return STAR;
979 case '/': return SLASH;
980 case '|': return VBAR;
981 case '&': return AMPER;
982 case '<': return LESS;
983 case '>': return GREATER;
984 case '=': return EQUAL;
985 case '.': return DOT;
986 case '%': return PERCENT;
987 case '`': return BACKQUOTE;
988 case '{': return LBRACE;
989 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000990 case '^': return CIRCUMFLEX;
991 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000992 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000993 default: return OP;
994 }
995}
996
997
Guido van Rossumfbab9051991-10-20 20:25:03 +0000998int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000999PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001000{
1001 switch (c1) {
1002 case '=':
1003 switch (c2) {
1004 case '=': return EQEQUAL;
1005 }
1006 break;
1007 case '!':
1008 switch (c2) {
1009 case '=': return NOTEQUAL;
1010 }
1011 break;
1012 case '<':
1013 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001014 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001015 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001016 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001017 }
1018 break;
1019 case '>':
1020 switch (c2) {
1021 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001022 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001023 }
1024 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001025 case '+':
1026 switch (c2) {
1027 case '=': return PLUSEQUAL;
1028 }
1029 break;
1030 case '-':
1031 switch (c2) {
1032 case '=': return MINEQUAL;
1033 }
1034 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001035 case '*':
1036 switch (c2) {
1037 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001038 case '=': return STAREQUAL;
1039 }
1040 break;
1041 case '/':
1042 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001043 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001044 case '=': return SLASHEQUAL;
1045 }
1046 break;
1047 case '|':
1048 switch (c2) {
1049 case '=': return VBAREQUAL;
1050 }
1051 break;
1052 case '%':
1053 switch (c2) {
1054 case '=': return PERCENTEQUAL;
1055 }
1056 break;
1057 case '&':
1058 switch (c2) {
1059 case '=': return AMPEREQUAL;
1060 }
1061 break;
1062 case '^':
1063 switch (c2) {
1064 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001065 }
1066 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001067 }
1068 return OP;
1069}
1070
Thomas Wouters434d0822000-08-24 20:11:32 +00001071int
1072PyToken_ThreeChars(int c1, int c2, int c3)
1073{
1074 switch (c1) {
1075 case '<':
1076 switch (c2) {
1077 case '<':
1078 switch (c3) {
1079 case '=':
1080 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001081 }
1082 break;
1083 }
1084 break;
1085 case '>':
1086 switch (c2) {
1087 case '>':
1088 switch (c3) {
1089 case '=':
1090 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001091 }
1092 break;
1093 }
1094 break;
1095 case '*':
1096 switch (c2) {
1097 case '*':
1098 switch (c3) {
1099 case '=':
1100 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001101 }
1102 break;
1103 }
1104 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001105 case '/':
1106 switch (c2) {
1107 case '/':
1108 switch (c3) {
1109 case '=':
1110 return DOUBLESLASHEQUAL;
1111 }
1112 break;
1113 }
1114 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001115 }
1116 return OP;
1117}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001118
Guido van Rossum926f13a1998-04-09 21:38:06 +00001119static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001120indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001121{
1122 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001123 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001124 tok->cur = tok->inp;
1125 return 1;
1126 }
1127 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001128 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1129 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001130 tok->altwarning = 0;
1131 }
1132 return 0;
1133}
1134
1135
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136/* Get next token, after space stripping etc. */
1137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001138static int
1139tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001140{
1141 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001142 int blankline;
1143
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001144 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001145 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001146 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001147 blankline = 0;
1148
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 /* Get indentation level */
1150 if (tok->atbol) {
1151 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001152 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001153 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001154 for (;;) {
1155 c = tok_nextc(tok);
1156 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001157 col++, altcol++;
1158 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160 altcol = (altcol/tok->alttabsize + 1)
1161 * tok->alttabsize;
1162 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001163 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001164 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 else
1166 break;
1167 }
1168 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001169 if (c == '#' || c == '\n') {
1170 /* Lines with only whitespace and/or comments
1171 shouldn't affect the indentation and are
1172 not passed to the parser as NEWLINE tokens,
1173 except *totally* empty lines in interactive
1174 mode, which signal the end of a command group. */
1175 if (col == 0 && c == '\n' && tok->prompt != NULL)
1176 blankline = 0; /* Let it through */
1177 else
1178 blankline = 1; /* Ignore completely */
1179 /* We can't jump back right here since we still
1180 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001181 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001182 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001183 if (col == tok->indstack[tok->indent]) {
1184 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001185 if (altcol != tok->altindstack[tok->indent]) {
1186 if (indenterror(tok))
1187 return ERRORTOKEN;
1188 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001190 else if (col > tok->indstack[tok->indent]) {
1191 /* Indent -- always one */
1192 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001193 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001194 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001195 return ERRORTOKEN;
1196 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001197 if (altcol <= tok->altindstack[tok->indent]) {
1198 if (indenterror(tok))
1199 return ERRORTOKEN;
1200 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001201 tok->pendin++;
1202 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001203 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001205 else /* col < tok->indstack[tok->indent] */ {
1206 /* Dedent -- any number, must be consistent */
1207 while (tok->indent > 0 &&
1208 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001209 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001210 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001211 }
1212 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001213 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001214 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001215 return ERRORTOKEN;
1216 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001217 if (altcol != tok->altindstack[tok->indent]) {
1218 if (indenterror(tok))
1219 return ERRORTOKEN;
1220 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 }
1222 }
1223 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001224
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001225 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001226
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227 /* Return pending indents/dedents */
1228 if (tok->pendin != 0) {
1229 if (tok->pendin < 0) {
1230 tok->pendin++;
1231 return DEDENT;
1232 }
1233 else {
1234 tok->pendin--;
1235 return INDENT;
1236 }
1237 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001238
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001239 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001240 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001241 /* Skip spaces */
1242 do {
1243 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001244 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001245
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001246 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001247 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001248
Guido van Rossumab5ca152000-03-31 00:52:27 +00001249 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001251 static char *tabforms[] = {
1252 "tab-width:", /* Emacs */
1253 ":tabstop=", /* vim, full form */
1254 ":ts=", /* vim, abbreviated form */
1255 "set tabsize=", /* will vi never die? */
1256 /* more templates can be added here to support other editors */
1257 };
1258 char cbuf[80];
1259 char *tp, **cp;
1260 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001262 *tp++ = c = tok_nextc(tok);
1263 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001264 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001265 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001266 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001267 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1268 cp++) {
1269 if ((tp = strstr(cbuf, *cp))) {
1270 int newsize = atoi(tp + strlen(*cp));
1271
1272 if (newsize >= 1 && newsize <= 40) {
1273 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001274 if (Py_VerboseFlag)
1275 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001276 "Tab size set to %d\n",
1277 newsize);
1278 }
1279 }
1280 }
1281 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001284
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001286 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001288 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 /* Identifier (most frequent token!) */
1291 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001292 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001293 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001294 case 'b':
1295 case 'B':
1296 c = tok_nextc(tok);
1297 if (c == 'r' || c == 'R')
1298 c = tok_nextc(tok);
1299 if (c == '"' || c == '\'')
1300 goto letter_quote;
1301 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001302 case 'r':
1303 case 'R':
1304 c = tok_nextc(tok);
1305 if (c == '"' || c == '\'')
1306 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001307 break;
1308 case 'u':
1309 case 'U':
1310 c = tok_nextc(tok);
1311 if (c == 'r' || c == 'R')
1312 c = tok_nextc(tok);
1313 if (c == '"' || c == '\'')
1314 goto letter_quote;
1315 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001316 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001317 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001319 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001321 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 *p_end = tok->cur;
1323 return NAME;
1324 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001325
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 /* Newline */
1327 if (c == '\n') {
1328 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001329 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001330 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001331 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001332 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001333 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 return NEWLINE;
1335 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001336
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001337 /* Period or number starting with period? */
1338 if (c == '.') {
1339 c = tok_nextc(tok);
1340 if (isdigit(c)) {
1341 goto fraction;
1342 }
1343 else {
1344 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001345 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001346 *p_end = tok->cur;
1347 return DOT;
1348 }
1349 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001350
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 /* Number */
1352 if (isdigit(c)) {
1353 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001354 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355 c = tok_nextc(tok);
1356 if (c == '.')
1357 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001358#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001359 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001360 goto imaginary;
1361#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001363
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001365 c = tok_nextc(tok);
1366 if (!isxdigit(c)) {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 do {
1372 c = tok_nextc(tok);
1373 } while (isxdigit(c));
1374 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001375 else if (c == 'o' || c == 'O') {
1376 /* Octal */
1377 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001378 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001379 tok->done = E_TOKEN;
1380 tok_backup(tok, c);
1381 return ERRORTOKEN;
1382 }
1383 do {
1384 c = tok_nextc(tok);
1385 } while ('0' <= c && c < '8');
1386 }
1387 else if (c == 'b' || c == 'B') {
1388 /* Binary */
1389 c = tok_nextc(tok);
1390 if (c != '0' && c != '1') {
1391 tok->done = E_TOKEN;
1392 tok_backup(tok, c);
1393 return ERRORTOKEN;
1394 }
1395 do {
1396 c = tok_nextc(tok);
1397 } while (c == '0' || c == '1');
1398 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001400 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 /* Octal; c is first char of it */
1402 /* There's no 'isoctdigit' macro, sigh */
1403 while ('0' <= c && c < '8') {
1404 c = tok_nextc(tok);
1405 }
Tim Petersd507dab2001-08-30 20:51:59 +00001406 if (isdigit(c)) {
1407 found_decimal = 1;
1408 do {
1409 c = tok_nextc(tok);
1410 } while (isdigit(c));
1411 }
1412 if (c == '.')
1413 goto fraction;
1414 else if (c == 'e' || c == 'E')
1415 goto exponent;
1416#ifndef WITHOUT_COMPLEX
1417 else if (c == 'j' || c == 'J')
1418 goto imaginary;
1419#endif
1420 else if (found_decimal) {
1421 tok->done = E_TOKEN;
1422 tok_backup(tok, c);
1423 return ERRORTOKEN;
1424 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001426 if (c == 'l' || c == 'L')
1427 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001428 }
1429 else {
1430 /* Decimal */
1431 do {
1432 c = tok_nextc(tok);
1433 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001434 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001436 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001437 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001438 if (c == '.') {
1439 fraction:
1440 /* Fraction */
1441 do {
1442 c = tok_nextc(tok);
1443 } while (isdigit(c));
1444 }
1445 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001446 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001447 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001448 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001449 if (c == '+' || c == '-')
1450 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001451 if (!isdigit(c)) {
1452 tok->done = E_TOKEN;
1453 tok_backup(tok, c);
1454 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001455 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001456 do {
1457 c = tok_nextc(tok);
1458 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001460#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001461 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001462 /* Imaginary part */
1463 imaginary:
1464 c = tok_nextc(tok);
1465#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 }
1467 }
1468 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001469 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001470 *p_end = tok->cur;
1471 return NUMBER;
1472 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001473
1474 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001475 /* String */
1476 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001477 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001478 int quote = c;
1479 int triple = 0;
1480 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 for (;;) {
1482 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001483 if (c == '\n') {
1484 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001485 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001486 tok_backup(tok, c);
1487 return ERRORTOKEN;
1488 }
1489 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001490 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001491 }
1492 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001493 if (triple)
1494 tok->done = E_EOFS;
1495 else
1496 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001497 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001498 return ERRORTOKEN;
1499 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001500 else if (c == quote) {
1501 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001502 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001503 c = tok_nextc(tok);
1504 if (c == quote) {
1505 triple = 1;
1506 tripcount = 0;
1507 continue;
1508 }
1509 tok_backup(tok, c);
1510 }
1511 if (!triple || tripcount == 3)
1512 break;
1513 }
1514 else if (c == '\\') {
1515 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001517 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001518 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001519 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001520 return ERRORTOKEN;
1521 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001522 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001523 else
1524 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001525 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001526 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001527 *p_end = tok->cur;
1528 return STRING;
1529 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001530
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001531 /* Line continuation */
1532 if (c == '\\') {
1533 c = tok_nextc(tok);
1534 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001535 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001536 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001537 return ERRORTOKEN;
1538 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001539 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001540 goto again; /* Read next line */
1541 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001542
Guido van Rossumfbab9051991-10-20 20:25:03 +00001543 /* Check for two-character token */
1544 {
1545 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001546 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001547#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001548 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001549 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001550 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001551 tok->filename, tok->lineno,
1552 NULL, NULL)) {
1553 return ERRORTOKEN;
1554 }
1555 }
1556#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001557 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001558 int c3 = tok_nextc(tok);
1559 int token3 = PyToken_ThreeChars(c, c2, c3);
1560 if (token3 != OP) {
1561 token = token3;
1562 } else {
1563 tok_backup(tok, c3);
1564 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001565 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001566 *p_end = tok->cur;
1567 return token;
1568 }
1569 tok_backup(tok, c2);
1570 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001571
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001572 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001573 switch (c) {
1574 case '(':
1575 case '[':
1576 case '{':
1577 tok->level++;
1578 break;
1579 case ')':
1580 case ']':
1581 case '}':
1582 tok->level--;
1583 break;
1584 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001585
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001586 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001587 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001588 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001589 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001590}
1591
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001592int
1593PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1594{
1595 int result = tok_get(tok, p_start, p_end);
1596 if (tok->decoding_erred) {
1597 result = ERRORTOKEN;
1598 tok->done = E_DECODE;
1599 }
1600 return result;
1601}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001602
Martin v. Löwisa5136192007-09-04 14:19:28 +00001603/* This function is only called from parsetok. However, it cannot live
1604 there, as it must be empty for PGEN, and we can check for PGEN only
1605 in this file. */
1606
Christian Heimes082c9b02008-01-23 14:20:50 +00001607#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building PGEN or without unicode support: there is
   nothing to convert, so all arguments are ignored, *offset is left
   untouched and NULL is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1613#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001614#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001615static PyObject *
1616dec_utf8(const char *enc, const char *text, size_t len) {
1617 PyObject *ret = NULL;
1618 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1619 if (unicode_text) {
1620 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1621 Py_DECREF(unicode_text);
1622 }
1623 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001624 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001625 }
1626 return ret;
1627}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001628char *
1629PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1630{
1631 char *text = NULL;
1632 if (tok->encoding) {
1633 /* convert source to original encondig */
1634 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1635 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001636 int linelen = PyString_Size(lineobj);
1637 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001638 text = PyObject_MALLOC(linelen + 1);
1639 if (text != NULL && line != NULL) {
1640 if (linelen)
1641 strncpy(text, line, linelen);
1642 text[linelen] = '\0';
1643 }
1644 Py_DECREF(lineobj);
1645
1646 /* adjust error offset */
1647 if (*offset > 1) {
1648 PyObject *offsetobj = dec_utf8(tok->encoding,
1649 tok->buf, *offset-1);
1650 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001651 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001652 Py_DECREF(offsetobj);
1653 }
1654 }
1655
1656 }
1657 }
1658 return text;
1659
1660}
Georg Brandl76b30d12008-01-07 18:41:34 +00001661#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001662#endif
1663
Martin v. Löwisa5136192007-09-04 14:19:28 +00001664
Guido van Rossum408027e1996-12-30 16:17:54 +00001665#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001666
1667void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001668tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001669{
Guido van Rossum86bea461997-04-29 21:03:06 +00001670 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001671 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1672 printf("(%.*s)", (int)(end - start), start);
1673}
1674
1675#endif