blob: 707e76291c47983f4359813a4a7ccf13a09f9985 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names */

/* Printable name for each token type, stored in token-number order.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* End of input, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and separators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT",

    /* Generic and sentinel entries */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
138 return fgets(s, size, tok->fp);
139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
144 return feof(tok->fp);
145}
146
/* PGEN build: no decoding is performed; STR is handed back unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000160 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Normalize the spelling of the two encodings the tokenizer handles
   itself (utf-8 and latin-1).

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is one of their recognized spellings (optionally
   followed by a '-' suffix); otherwise returns S itself, unchanged.
   Callers compare the result against S to tell whether it was
   replaced. */
static char *
get_normal_name(char *s) /* for utf-8 and latin-1 */
{
    char buf[13];   /* 12 characters ("iso-latin-1-") plus the NUL */
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative plain-char value
           to tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
204
205/* Return the coding spec in S, or NULL if none is found. */
206
207static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000208get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000209{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
218 }
219 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
229
230 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000231 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000232 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 t++;
234
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000239 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000240 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 }
242 return r;
243 }
244 }
245 }
246 return NULL;
247}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000279 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000286 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000287#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000291 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000318 ch = get_char(tok);
319 if (ch != 0xBB)
320 goto NON_BOM;
321 ch = get_char(tok);
322 if (ch != 0xBF)
323 goto NON_BOM;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000324#if 0
325 /* Disable support for UTF-16 BOMs until a decision
326 is made whether this needs to be supported. */
327 } else if (ch == 0xFE) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000328 ch = get_char(tok);
329 if (ch != 0xFF)
330 goto NON_BOM;
331 if (!set_readline(tok, "utf-16-be"))
332 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333 tok->decoding_state = -1;
334 } else if (ch == 0xFF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000335 ch = get_char(tok);
336 if (ch != 0xFE)
337 goto NON_BOM;
338 if (!set_readline(tok, "utf-16-le"))
339 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 tok->decoding_state = -1;
341#endif
342 } else {
343 unget_char(ch, tok);
344 return 1;
345 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000346 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000347 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
349 return 1;
350 NON_BOM:
351 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
352 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
353 return 1;
354}
355
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure or at end of input, else S.

   At most SIZE-1 bytes are stored in S, followed by a NUL.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
           and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
           room (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the
           overflow.  In this case, fp_readl is called in a loop (with an
           expanded buffer) until the buffer ends with a '\n' (or until
           the end of the file is reached): see tok_nextc and its calls
           to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *data;
    Py_ssize_t nbytes;
    int room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over whatever an earlier call left behind. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending)) {
            /* Case 3: already UTF-8 bytes, no conversion needed. */
            encoded = pending;
        }
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL) {
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL) {
            return error_ret(tok);
        }
    }

    data = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Keep the part that does not fit for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(data + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }

    memcpy(s, data, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
421
422/* Set the readline function for TOK to a StreamReader's
423 readline function. The StreamReader is named ENC.
424
425 This function is called from check_bom and check_coding_spec.
426
427 ENC is usually identical to the future value of tok->encoding,
428 except for the (currently unsupported) case of UTF-16.
429
430 Return 1 on success, 0 on failure. */
431
432static int
433fp_setreadl(struct tok_state *tok, const char* enc)
434{
435 PyObject *reader, *stream, *readline;
436
Martin v. Löwis95292d62002-12-11 14:04:59 +0000437 /* XXX: constify filename argument. */
438 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000439 if (stream == NULL)
440 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441
442 reader = PyCodec_StreamReader(enc, stream, NULL);
443 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000444 if (reader == NULL)
445 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000446
447 readline = PyObject_GetAttrString(reader, "readline");
448 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000449 if (readline == NULL)
450 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000451
452 tok->decoding_readline = readline;
453 return 1;
454}
455
456/* Fetch the next byte from TOK. */
457
458static int fp_getc(struct tok_state *tok) {
459 return getc(tok->fp);
460}
461
462/* Unfetch the last byte back into TOK. */
463
464static void fp_ungetc(int c, struct tok_state *tok) {
465 ungetc(c, tok->fp);
466}
467
468/* Read a line of input from TOK. Determine encoding
469 if necessary. */
470
471static char *
472decoding_fgets(char *s, int size, struct tok_state *tok)
473{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000474 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000475 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000476 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477 if (tok->decoding_state < 0) {
478 /* We already have a codec associated with
479 this input. */
480 line = fp_readl(s, size, tok);
481 break;
482 } else if (tok->decoding_state > 0) {
483 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000484 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486 break;
487 } else {
488 /* We have not yet determined the encoding.
489 If an encoding is found, use the file-pointer
490 reader functions from now on. */
491 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
492 return error_ret(tok);
493 assert(tok->decoding_state != 0);
494 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000495 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
497 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
498 return error_ret(tok);
499 }
500 }
501#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000502 /* The default encoding is ASCII, so make sure we don't have any
503 non-ASCII bytes in it. */
504 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000506 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 if (*c > 127) {
508 badchar = *c;
509 break;
510 }
511 }
512 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000513 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000514 /* Need to add 1 to the line number, since this line
515 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000516 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000517 "Non-ASCII character '\\x%.2x' "
518 "in file %.200s on line %i, "
519 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000520 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000521 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000522 PyErr_SetString(PyExc_SyntaxError, buf);
523 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 }
525#endif
526 return line;
527}
528
529static int
530decoding_feof(struct tok_state *tok)
531{
532 if (tok->decoding_state >= 0) {
533 return feof(tok->fp);
534 } else {
535 PyObject* buf = tok->decoding_buffer;
536 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000537 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000538 if (buf == NULL) {
539 error_ret(tok);
540 return 1;
541 } else {
542 tok->decoding_buffer = buf;
543 }
544 }
545 return PyObject_Length(buf) == 0;
546 }
547}
548
549/* Fetch a byte from TOK, using the string buffer. */
550
Tim Petersc9d78aa2006-03-26 23:27:58 +0000551static int
552buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000553 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554}
555
556/* Unfetch a byte from TOK, using the string buffer. */
557
Tim Petersc9d78aa2006-03-26 23:27:58 +0000558static void
559buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000561 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562}
563
564/* Set the readline function for TOK to ENC. For the string-based
565 tokenizer, this means to just record the encoding. */
566
Tim Petersc9d78aa2006-03-26 23:27:58 +0000567static int
568buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000569 tok->enc = enc;
570 return 1;
571}
572
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588
589/* Decode a byte string STR for use as the buffer of TOK.
590 Look for encoding declarations inside STR, and record them
591 inside TOK. */
592
593static const char *
594decode_str(const char *str, struct tok_state *tok)
595{
596 PyObject* utf8 = NULL;
597 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000598 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 int lineno = 0;
600 tok->enc = NULL;
601 tok->str = str;
602 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000603 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000605 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000606#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 if (tok->enc != NULL) {
608 utf8 = translate_into_utf8(str, tok->enc);
609 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000610 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000611 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000612 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000613#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614 for (s = str;; s++) {
615 if (*s == '\0') break;
616 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000617 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000618 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 lineno++;
620 if (lineno == 2) break;
621 }
622 }
623 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000624 /* need to check line 1 and 2 separately since check_coding_spec
625 assumes a single line as input */
626 if (newl[0]) {
627 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
628 return error_ret(tok);
629 if (tok->enc == NULL && newl[1]) {
630 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
631 tok, buf_setreadl))
632 return error_ret(tok);
633 }
634 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000635#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 if (tok->enc != NULL) {
637 assert(utf8 == NULL);
638 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000639 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000641 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000643#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 assert(tok->decoding_buffer == NULL);
645 tok->decoding_buffer = utf8; /* CAUTION */
646 return str;
647}
648
649#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650
651/* Set up tokenizer for string */
652
653struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000654PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655{
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000660 if (str == NULL) {
661 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000663 }
664
Martin v. Löwis95292d62002-12-11 14:04:59 +0000665 /* XXX: constify members. */
666 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 return tok;
668}
669
670
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000671/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672
673struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000674PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675{
676 struct tok_state *tok = tok_new();
677 if (tok == NULL)
678 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000679 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000680 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681 return NULL;
682 }
683 tok->cur = tok->inp = tok->buf;
684 tok->end = tok->buf + BUFSIZ;
685 tok->fp = fp;
686 tok->prompt = ps1;
687 tok->nextprompt = ps2;
688 return tok;
689}
690
691
692/* Free a tok_state structure */
693
694void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000695PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000696{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000697 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000698 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000699#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700 Py_XDECREF(tok->decoding_readline);
701 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000702#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000704 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000705 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706}
707
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the line *INP from the encoding of sys.stdin to UTF-8.

   Nothing happens (return 0) unless sys.stdin is the C stdin stream,
   is a file object and carries a string f_encoding.  On a successful
   conversion *INP is freed and replaced by a newly allocated UTF-8
   copy, and tok->encoding is replaced by a copy of the encoding name.
   If decoding or encoding fails the error is cleared, *INP is left
   alone and 0 is returned.

   Returns 0 in all of the cases above, or -1 with tok->done set to
   E_NOMEM when a copy cannot be allocated. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *as_unicode, *as_utf8;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin) {
        return 0;
    }
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj)) {
        return 0;
    }

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj)) {
        return 0;
    }
    /* Hold the encoding object while its buffer is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    as_unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (as_unicode == NULL) {
        goto fallback;
    }

    as_utf8 = PyUnicode_AsEncodedString(as_unicode, "utf-8", NULL);
    Py_DECREF(as_unicode);
    if (as_utf8 == NULL) {
        goto fallback;
    }

    assert(PyString_Check(as_utf8));
    recoded = new_string(PyString_AS_STRING(as_utf8),
                         PyString_GET_SIZE(as_utf8));
    Py_DECREF(as_utf8);
    if (recoded == NULL) {
        goto out_of_memory;
    }

    /* Swap the caller's line for the recoded copy. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL) {
        PyMem_FREE(tok->encoding);
    }
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL) {
        goto out_of_memory;
    }

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

fallback:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767
768/* Get next char, updating state; error code goes into tok->done */
769
/* tok_nextc: return the next input character (passed through Py_CHARMASK)
   or EOF.

   While unread characters remain between tok->cur and tok->inp they are
   handed out directly.  When the buffered line is exhausted, one more
   line is fetched from whichever source is configured:

     - tok->fp == NULL:      the input is a string already in memory;
                             tok->inp is simply advanced to the end of
                             the next line.
     - tok->prompt != NULL:  interactive input, read with PyOS_Readline.
     - otherwise:            input comes through decoding_fgets, and
                             tok->buf is grown in BUFSIZ steps until a
                             complete line is buffered.

   tok->lineno is incremented for every line fetched.  When a token is
   in progress (tok->start != NULL) the new line is appended to the
   existing buffer so that the token text stays contiguous; otherwise
   the buffer contents are replaced.  On end of input or on an error an
   E_* code other than E_OK is stored in tok->done and EOF is returned. */
770static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000771tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000774 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000775 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000776 }
    /* Buffer exhausted.  Once an end-of-input or error state has been
       recorded, keep answering EOF without touching the source again. */
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000777 if (tok->done != E_OK)
778 return EOF;
    /* Case 1: tokenizing an in-memory string. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000780 char *end = strchr(tok->inp, '\n');
781 if (end != NULL)
782 end++;
783 else {
    /* No newline left: the rest of the string is the last line,
       and an empty rest means end of input. */
784 end = strchr(tok->inp, '\0');
785 if (end == tok->inp) {
786 tok->done = E_EOF;
787 return EOF;
788 }
789 }
    /* With no token in progress the logical buffer start can be moved
       up to the line about to be returned. */
790 if (tok->start == NULL)
791 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000792 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000793 tok->lineno++;
794 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000795 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000796 }
    /* Case 2: interactive input.  PyOS_Readline hands back a malloc-ed
       line, an empty malloc-ed string at EOF, or NULL when interrupted. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000798 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
    /* After the first line, switch to the continuation prompt. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799 if (tok->nextprompt != NULL)
800 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000801 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000802 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000803 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000804 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805 tok->done = E_EOF;
806 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000807#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000808 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000809 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000810#endif
    /* A token is in progress: append the new line to the existing
       buffer.  Positions are saved as offsets first because the
       reallocation may move the buffer. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000812 size_t start = tok->start - tok->buf;
813 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000814 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000815 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000816 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 tok->lineno++;
818 if (buf == NULL) {
    /* Out of memory: the old buffer is still owned by tok->buf, so
       release it together with the line just read. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000819 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000820 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000821 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000822 tok->done = E_NOMEM;
823 return EOF;
824 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000825 tok->buf = buf;
826 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000827 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000828 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000829 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 tok->inp = tok->buf + newlen;
831 tok->end = tok->inp + 1;
832 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000833 }
    /* No token in progress: drop the old buffer and let the line
       returned by PyOS_Readline become the buffer. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 else {
835 tok->lineno++;
836 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000837 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000838 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000839 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 tok->cur = tok->buf;
    /* NOTE(review): line_start was already assigned two statements
       above; the repeated assignment below looks redundant. */
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000841 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 tok->inp = strchr(tok->buf, '\0');
843 tok->end = tok->inp + 1;
844 }
845 }
    /* Case 3: input read through decoding_fgets. */
846 else {
847 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000848 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000849 char *pt;
    /* No token in progress: (re)fill the buffer from its beginning,
       allocating the initial BUFSIZ bytes on first use. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000850 if (tok->start == NULL) {
851 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000852 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000853 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 if (tok->buf == NULL) {
855 tok->done = E_NOMEM;
856 return EOF;
857 }
858 tok->end = tok->buf + BUFSIZ;
859 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000860 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
861 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 tok->done = E_EOF;
863 done = 1;
864 }
865 else {
866 tok->done = E_OK;
867 tok->inp = strchr(tok->buf, '\0');
    /* The line is complete only if it ends in a newline; otherwise
       the loop below keeps reading. */
868 done = tok->inp[-1] == '\n';
869 }
870 }
    /* A token is in progress: keep what is buffered, remember the
       read position as an offset, and let the loop below append. */
871 else {
872 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000873 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000874 tok->done = E_EOF;
875 done = 1;
876 }
877 else
878 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 }
880 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000881 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000882 while (!done) {
    /* Grow the buffer by BUFSIZ beyond the valid data.  tok->start is
       carried across the reallocation as an offset (-1 for none). */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883 Py_ssize_t curstart = tok->start == NULL ? -1 :
884 tok->start - tok->buf;
885 Py_ssize_t curvalid = tok->inp - tok->buf;
886 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000888 newbuf = (char *)PyMem_REALLOC(newbuf,
889 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000890 if (newbuf == NULL) {
891 tok->done = E_NOMEM;
892 tok->cur = tok->inp;
893 return EOF;
894 }
895 tok->buf = newbuf;
896 tok->inp = tok->buf + curvalid;
897 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000898 tok->start = curstart < 0 ? NULL :
899 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000900 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000901 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000902 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000903 /* Break out early on decoding
904 errors, as tok->buf will be NULL
905 */
906 if (tok->decoding_erred)
907 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000908 /* Last line does not end in \n,
909 fake one */
910 strcpy(tok->inp, "\n");
911 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000912 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000913 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000914 }
    /* Re-derive the read position from the saved offset (0 when the
       buffer was refilled from its beginning). */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000915 if (tok->buf != NULL) {
916 tok->cur = tok->buf + cur;
917 tok->line_start = tok->cur;
918 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000919 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000920 pt = tok->inp - 2;
921 if (pt >= tok->buf && *pt == '\r') {
922 *pt++ = '\n';
923 *pt = '\0';
924 tok->inp = pt;
925 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000926 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000927 }
    /* The refill ended in EOF or an error: in interactive mode emit a
       newline on stderr, mark the buffer as consumed and report EOF.
       Otherwise loop back to the fast path. */
928 if (tok->done != E_OK) {
929 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000930 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000931 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000932 return EOF;
933 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000934 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000935 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000936}
937
938
939/* Back-up one character */
940
941static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000942tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000943{
944 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000945 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000946 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000947 if (*tok->cur != c)
948 *tok->cur = c;
949 }
950}
951
952
953/* Return the token corresponding to a single character */
954
955int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000956PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957{
958 switch (c) {
959 case '(': return LPAR;
960 case ')': return RPAR;
961 case '[': return LSQB;
962 case ']': return RSQB;
963 case ':': return COLON;
964 case ',': return COMMA;
965 case ';': return SEMI;
966 case '+': return PLUS;
967 case '-': return MINUS;
968 case '*': return STAR;
969 case '/': return SLASH;
970 case '|': return VBAR;
971 case '&': return AMPER;
972 case '<': return LESS;
973 case '>': return GREATER;
974 case '=': return EQUAL;
975 case '.': return DOT;
976 case '%': return PERCENT;
977 case '`': return BACKQUOTE;
978 case '{': return LBRACE;
979 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000980 case '^': return CIRCUMFLEX;
981 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000982 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983 default: return OP;
984 }
985}
986
987
Guido van Rossumfbab9051991-10-20 20:25:03 +0000988int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000989PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000990{
991 switch (c1) {
992 case '=':
993 switch (c2) {
994 case '=': return EQEQUAL;
995 }
996 break;
997 case '!':
998 switch (c2) {
999 case '=': return NOTEQUAL;
1000 }
1001 break;
1002 case '<':
1003 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001004 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001005 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001006 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001007 }
1008 break;
1009 case '>':
1010 switch (c2) {
1011 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001012 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001013 }
1014 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001015 case '+':
1016 switch (c2) {
1017 case '=': return PLUSEQUAL;
1018 }
1019 break;
1020 case '-':
1021 switch (c2) {
1022 case '=': return MINEQUAL;
1023 }
1024 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001025 case '*':
1026 switch (c2) {
1027 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001028 case '=': return STAREQUAL;
1029 }
1030 break;
1031 case '/':
1032 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001033 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001034 case '=': return SLASHEQUAL;
1035 }
1036 break;
1037 case '|':
1038 switch (c2) {
1039 case '=': return VBAREQUAL;
1040 }
1041 break;
1042 case '%':
1043 switch (c2) {
1044 case '=': return PERCENTEQUAL;
1045 }
1046 break;
1047 case '&':
1048 switch (c2) {
1049 case '=': return AMPEREQUAL;
1050 }
1051 break;
1052 case '^':
1053 switch (c2) {
1054 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001055 }
1056 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001057 }
1058 return OP;
1059}
1060
Thomas Wouters434d0822000-08-24 20:11:32 +00001061int
1062PyToken_ThreeChars(int c1, int c2, int c3)
1063{
1064 switch (c1) {
1065 case '<':
1066 switch (c2) {
1067 case '<':
1068 switch (c3) {
1069 case '=':
1070 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001071 }
1072 break;
1073 }
1074 break;
1075 case '>':
1076 switch (c2) {
1077 case '>':
1078 switch (c3) {
1079 case '=':
1080 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001081 }
1082 break;
1083 }
1084 break;
1085 case '*':
1086 switch (c2) {
1087 case '*':
1088 switch (c3) {
1089 case '=':
1090 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001091 }
1092 break;
1093 }
1094 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001095 case '/':
1096 switch (c2) {
1097 case '/':
1098 switch (c3) {
1099 case '=':
1100 return DOUBLESLASHEQUAL;
1101 }
1102 break;
1103 }
1104 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001105 }
1106 return OP;
1107}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001108
Guido van Rossum926f13a1998-04-09 21:38:06 +00001109static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001110indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001111{
1112 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001113 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001114 tok->cur = tok->inp;
1115 return 1;
1116 }
1117 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001118 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1119 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 tok->altwarning = 0;
1121 }
1122 return 0;
1123}
1124
1125
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001126/* Get next token, after space stripping etc. */
1127
/* tok_get: scan the next token and return its type.

   *p_start and *p_end are set to NULL on entry.  For NAME, NEWLINE,
   DOT, NUMBER, STRING and operator tokens they are then set to the
   first character of the token text and to one past its last
   character.  For INDENT, DEDENT, ENDMARKER and ERRORTOKEN they are
   left NULL.

   Processing order:
     1. At the beginning of a line, measure the indentation and queue
        INDENT / DEDENT tokens in tok->pendin.
     2. Hand out any pending INDENT / DEDENT.
     3. Skip blanks, then a comment (looking for editor tab-size magic).
     4. Recognise, in this order: EOF, identifier or prefixed string,
        newline, dot or number, quoted string, backslash continuation,
        two- and three-character operators, single-character tokens.

   tok->level tracks bracket nesting; while it is positive, newlines
   and indentation are not reported.  Error paths return ERRORTOKEN;
   most of them store an E_* code in tok->done first (the path taken
   when the <> deprecation warning call fails does not). */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001128static int
1129tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001130{
1131 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001132 int blankline;
1133
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001134 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001135 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001136 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001137 blankline = 0;
1138
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001139 /* Get indentation level */
1140 if (tok->atbol) {
    /* col is the column computed with tok->tabsize; altcol is the same
       measurement with tok->alttabsize.  The two are compared below to
       detect indentation whose meaning depends on the tab size. */
1141 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001142 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001143 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144 for (;;) {
1145 c = tok_nextc(tok);
1146 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001147 col++, altcol++;
1148 else if (c == '\t') {
    /* A tab advances to the next multiple of the tab size. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001150 altcol = (altcol/tok->alttabsize + 1)
1151 * tok->alttabsize;
1152 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001153 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001154 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001155 else
1156 break;
1157 }
    /* Put back the first character that is not indentation. */
1158 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001159 if (c == '#' || c == '\n') {
1160 /* Lines with only whitespace and/or comments
1161 shouldn't affect the indentation and are
1162 not passed to the parser as NEWLINE tokens,
1163 except *totally* empty lines in interactive
1164 mode, which signal the end of a command group. */
1165 if (col == 0 && c == '\n' && tok->prompt != NULL)
1166 blankline = 0; /* Let it through */
1167 else
1168 blankline = 1; /* Ignore completely */
1169 /* We can't jump back right here since we still
1170 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001171 }
    /* Indentation only matters outside brackets and on lines that
       carry code.  tok->pendin accumulates the INDENT (positive) or
       DEDENT (negative) tokens still to be handed out. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001172 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001173 if (col == tok->indstack[tok->indent]) {
1174 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001175 if (altcol != tok->altindstack[tok->indent]) {
1176 if (indenterror(tok))
1177 return ERRORTOKEN;
1178 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001180 else if (col > tok->indstack[tok->indent]) {
1181 /* Indent -- always one */
1182 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001183 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001184 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001185 return ERRORTOKEN;
1186 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001187 if (altcol <= tok->altindstack[tok->indent]) {
1188 if (indenterror(tok))
1189 return ERRORTOKEN;
1190 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001191 tok->pendin++;
1192 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001193 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001195 else /* col < tok->indstack[tok->indent] */ {
1196 /* Dedent -- any number, must be consistent */
1197 while (tok->indent > 0 &&
1198 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001199 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001200 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001201 }
    /* After popping, the column must match an enclosing level. */
1202 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001203 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001204 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001205 return ERRORTOKEN;
1206 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001207 if (altcol != tok->altindstack[tok->indent]) {
1208 if (indenterror(tok))
1209 return ERRORTOKEN;
1210 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 }
1212 }
1213 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001214
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001215 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001216
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 /* Return pending indents/dedents */
    /* One token is handed out per call; pendin moves one step towards
       zero each time. */
1218 if (tok->pendin != 0) {
1219 if (tok->pendin < 0) {
1220 tok->pendin++;
1221 return DEDENT;
1222 }
1223 else {
1224 tok->pendin--;
1225 return INDENT;
1226 }
1227 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001228
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001230 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 /* Skip spaces */
1232 do {
1233 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001234 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001235
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001236 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001237 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001238
Guido van Rossumab5ca152000-03-31 00:52:27 +00001239 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001241 static char *tabforms[] = {
1242 "tab-width:", /* Emacs */
1243 ":tabstop=", /* vim, full form */
1244 ":ts=", /* vim, abbreviated form */
1245 "set tabsize=", /* will vi never die? */
1246 /* more templates can be added here to support other editors */
1247 };
1248 char cbuf[80];
1249 char *tp, **cp;
1250 tp = cbuf;
    /* Copy the start of the comment text into cbuf, stopping at the
       end of the line, at EOF, or when only room for the terminating
       NUL is left. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001252 *tp++ = c = tok_nextc(tok);
1253 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001254 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001255 *tp = '\0';
    /* Look for each known editor directive; a value from 1 to 40
       replaces tok->tabsize. */
Tim Petersc9d78aa2006-03-26 23:27:58 +00001256 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001257 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1258 cp++) {
1259 if ((tp = strstr(cbuf, *cp))) {
1260 int newsize = atoi(tp + strlen(*cp));
1261
1262 if (newsize >= 1 && newsize <= 40) {
1263 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001264 if (Py_VerboseFlag)
1265 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001266 "Tab size set to %d\n",
1267 newsize);
1268 }
1269 }
1270 }
    /* Discard whatever is left of the comment line. */
1271 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001274
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001276 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001278 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001279
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 /* Identifier (most frequent token!) */
1281 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001282 /* Process string prefixes: b, br, r, u and ur (either case)
      directly followed by a quote start a string literal */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001283 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001284 case 'b':
1285 case 'B':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001292 case 'r':
1293 case 'R':
1294 c = tok_nextc(tok);
1295 if (c == '"' || c == '\'')
1296 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001297 break;
1298 case 'u':
1299 case 'U':
1300 c = tok_nextc(tok);
1301 if (c == 'r' || c == 'R')
1302 c = tok_nextc(tok);
1303 if (c == '"' || c == '\'')
1304 goto letter_quote;
1305 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001306 }
    /* Not a string prefix after all: consume the rest of the name. */
Guido van Rossum24dacb31997-04-06 03:46:20 +00001307 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001309 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001311 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001312 *p_end = tok->cur;
1313 return NAME;
1314 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001315
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 /* Newline */
1317 if (c == '\n') {
1318 tok->atbol = 1;
    /* Blank lines and newlines inside brackets are not reported. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001319 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001320 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001321 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001323 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 return NEWLINE;
1325 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001326
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001327 /* Period or number starting with period? */
1328 if (c == '.') {
1329 c = tok_nextc(tok);
1330 if (isdigit(c)) {
1331 goto fraction;
1332 }
1333 else {
1334 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001335 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001336 *p_end = tok->cur;
1337 return DOT;
1338 }
1339 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001340
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 /* Number */
1342 if (isdigit(c)) {
1343 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001344 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001345 c = tok_nextc(tok);
1346 if (c == '.')
1347 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001348#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001349 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001350 goto imaginary;
1351#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001353
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001355 c = tok_nextc(tok);
1356 if (!isxdigit(c)) {
1357 tok->done = E_TOKEN;
1358 tok_backup(tok, c);
1359 return ERRORTOKEN;
1360 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001361 do {
1362 c = tok_nextc(tok);
1363 } while (isxdigit(c));
1364 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001365 else if (c == 'o' || c == 'O') {
1366 /* Octal */
1367 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001368 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001369 tok->done = E_TOKEN;
1370 tok_backup(tok, c);
1371 return ERRORTOKEN;
1372 }
1373 do {
1374 c = tok_nextc(tok);
1375 } while ('0' <= c && c < '8');
1376 }
1377 else if (c == 'b' || c == 'B') {
1378 /* Binary */
1379 c = tok_nextc(tok);
1380 if (c != '0' && c != '1') {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1384 }
1385 do {
1386 c = tok_nextc(tok);
1387 } while (c == '0' || c == '1');
1388 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001390 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 /* Octal; c is first char of it */
1392 /* There's no 'isoctdigit' macro, sigh */
1393 while ('0' <= c && c < '8') {
1394 c = tok_nextc(tok);
1395 }
Tim Petersd507dab2001-08-30 20:51:59 +00001396 if (isdigit(c)) {
1397 found_decimal = 1;
1398 do {
1399 c = tok_nextc(tok);
1400 } while (isdigit(c));
1401 }
1402 if (c == '.')
1403 goto fraction;
1404 else if (c == 'e' || c == 'E')
1405 goto exponent;
1406#ifndef WITHOUT_COMPLEX
1407 else if (c == 'j' || c == 'J')
1408 goto imaginary;
1409#endif
1410 else if (found_decimal) {
1411 tok->done = E_TOKEN;
1412 tok_backup(tok, c);
1413 return ERRORTOKEN;
1414 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001416 if (c == 'l' || c == 'L')
1417 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 }
1419 else {
1420 /* Decimal */
1421 do {
1422 c = tok_nextc(tok);
1423 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001424 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001426 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001427 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001428 if (c == '.') {
1429 fraction:
1430 /* Fraction */
1431 do {
1432 c = tok_nextc(tok);
1433 } while (isdigit(c));
1434 }
1435 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001436 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001437 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001439 if (c == '+' || c == '-')
1440 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001441 if (!isdigit(c)) {
1442 tok->done = E_TOKEN;
1443 tok_backup(tok, c);
1444 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001445 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001446 do {
1447 c = tok_nextc(tok);
1448 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001449 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001450#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001451 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001452 /* Imaginary part */
1453 imaginary:
1454 c = tok_nextc(tok);
1455#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001456 }
1457 }
1458 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001459 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001460 *p_end = tok->cur;
1461 return NUMBER;
1462 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001463
1464 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001465 /* String */
1466 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001468 int quote = c;
1469 int triple = 0;
1470 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001471 for (;;) {
1472 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001473 if (c == '\n') {
1474 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001475 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001476 tok_backup(tok, c);
1477 return ERRORTOKEN;
1478 }
1479 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001480 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001481 }
1482 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001483 if (triple)
1484 tok->done = E_EOFS;
1485 else
1486 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001487 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001488 return ERRORTOKEN;
1489 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001490 else if (c == quote) {
1491 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001492 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001493 c = tok_nextc(tok);
1494 if (c == quote) {
1495 triple = 1;
1496 tripcount = 0;
1497 continue;
1498 }
1499 tok_backup(tok, c);
1500 }
1501 if (!triple || tripcount == 3)
1502 break;
1503 }
1504 else if (c == '\\') {
1505 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001506 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001507 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001508 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001509 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001510 return ERRORTOKEN;
1511 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001512 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001513 else
1514 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001515 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001516 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001517 *p_end = tok->cur;
1518 return STRING;
1519 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001520
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 /* Line continuation */
1522 if (c == '\\') {
1523 c = tok_nextc(tok);
1524 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001525 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001526 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001527 return ERRORTOKEN;
1528 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001529 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001530 goto again; /* Read next line */
1531 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001532
Guido van Rossumfbab9051991-10-20 20:25:03 +00001533 /* Check for two-character token */
1534 {
1535 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001536 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001537#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001538 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001539 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001540 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001541 tok->filename, tok->lineno,
1542 NULL, NULL)) {
1543 return ERRORTOKEN;
1544 }
1545 }
1546#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001547 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001548 int c3 = tok_nextc(tok);
1549 int token3 = PyToken_ThreeChars(c, c2, c3);
1550 if (token3 != OP) {
1551 token = token3;
1552 } else {
1553 tok_backup(tok, c3);
1554 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001555 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001556 *p_end = tok->cur;
1557 return token;
1558 }
1559 tok_backup(tok, c2);
1560 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001561
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001562 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001563 switch (c) {
1564 case '(':
1565 case '[':
1566 case '{':
1567 tok->level++;
1568 break;
1569 case ')':
1570 case ']':
1571 case '}':
1572 tok->level--;
1573 break;
1574 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001575
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001576 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001577 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001578 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001579 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001580}
1581
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001582int
1583PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1584{
1585 int result = tok_get(tok, p_start, p_end);
1586 if (tok->decoding_erred) {
1587 result = ERRORTOKEN;
1588 tok->done = E_DECODE;
1589 }
1590 return result;
1591}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001592
Martin v. Löwisa5136192007-09-04 14:19:28 +00001593/* This function is only called from parsetok. However, it cannot live
1594 there, as it must be empty for PGEN, and we can check for PGEN only
1595 in this file. */
1596
Christian Heimes082c9b02008-01-23 14:20:50 +00001597#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building PGEN or when Unicode support is compiled out:
   no re-encoding is performed.  The arguments are ignored, *offset is
   left untouched, and NULL is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    /* Parameters exist only to match the full implementation's signature. */
    (void)tok;
    (void)len;
    (void)offset;

    return NULL;
}
1603#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001604#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001605static PyObject *
1606dec_utf8(const char *enc, const char *text, size_t len) {
1607 PyObject *ret = NULL;
1608 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1609 if (unicode_text) {
1610 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1611 Py_DECREF(unicode_text);
1612 }
1613 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001614 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001615 }
1616 return ret;
1617}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001618char *
1619PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1620{
1621 char *text = NULL;
1622 if (tok->encoding) {
1623 /* convert source to original encondig */
1624 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1625 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001626 int linelen = PyString_Size(lineobj);
1627 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001628 text = PyObject_MALLOC(linelen + 1);
1629 if (text != NULL && line != NULL) {
1630 if (linelen)
1631 strncpy(text, line, linelen);
1632 text[linelen] = '\0';
1633 }
1634 Py_DECREF(lineobj);
1635
1636 /* adjust error offset */
1637 if (*offset > 1) {
1638 PyObject *offsetobj = dec_utf8(tok->encoding,
1639 tok->buf, *offset-1);
1640 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001641 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001642 Py_DECREF(offsetobj);
1643 }
1644 }
1645
1646 }
1647 }
1648 return text;
1649
1650}
Georg Brandl76b30d12008-01-07 18:41:34 +00001651#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001652#endif
1653
Martin v. Löwisa5136192007-09-04 14:19:28 +00001654
Guido van Rossum408027e1996-12-30 16:17:54 +00001655#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001656
1657void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001658tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001659{
Guido van Rossum86bea461997-04-29 21:03:06 +00001660 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001661 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1662 printf("(%.*s)", (int)(end - start), start);
1663}
1664
1665#endif