
/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
/* PyOS_Readline:
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* basic tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* more operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* generic / sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
138 return fgets(s, size, tok->fp);
139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
144 return feof(tok->fp);
145}
146
/* PGEN build: the input string is used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000160 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S is one of the recognised spellings (optionally followed by
   "-<suffix>"); otherwise returns S itself, unmodified.  Callers tell
   the two cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing tolower() a negative value
           (a high-bit byte where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
201static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000202get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000204 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
212 }
213 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
223
224 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000225 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000226 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000227 t++;
228
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000233 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000234 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 }
236 return r;
237 }
238 }
239 }
240 return NULL;
241}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
248static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000249check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 int set_readline(struct tok_state *, const char *))
251{
Tim Peters17db21f2002-09-03 15:39:58 +0000252 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000254
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000258 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000267#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
272 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000273 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000274 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
Victor Stinner0217c952010-03-21 13:09:24 +0000307 int ch1, ch2, ch3;
308 ch1 = get_char(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309 tok->decoding_state = 1;
Victor Stinner0217c952010-03-21 13:09:24 +0000310 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000311 return 1;
Victor Stinner0217c952010-03-21 13:09:24 +0000312 } else if (ch1 == 0xEF) {
313 ch2 = get_char(tok);
314 if (ch2 != 0xBB) {
315 unget_char(ch2, tok);
316 unget_char(ch1, tok);
317 return 1;
318 }
319 ch3 = get_char(tok);
320 if (ch3 != 0xBF) {
321 unget_char(ch3, tok);
322 unget_char(ch2, tok);
323 unget_char(ch1, tok);
324 return 1;
325 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326#if 0
327 /* Disable support for UTF-16 BOMs until a decision
328 is made whether this needs to be supported. */
Victor Stinner0217c952010-03-21 13:09:24 +0000329 } else if (ch1 == 0xFE) {
330 ch2 = get_char(tok);
331 if (ch2 != 0xFF) {
332 unget_char(ch2, tok);
333 unget_char(ch1, tok);
334 return 1;
335 }
336 if (!set_readline(tok, "utf-16-be"))
337 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338 tok->decoding_state = -1;
Victor Stinner0217c952010-03-21 13:09:24 +0000339 } else if (ch1 == 0xFF) {
340 ch2 = get_char(tok);
341 if (ch2 != 0xFE) {
342 unget_char(ch2, tok);
343 unget_char(ch1, tok);
344 return 1;
345 }
346 if (!set_readline(tok, "utf-16-le"))
347 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 tok->decoding_state = -1;
349#endif
350 } else {
Victor Stinner0217c952010-03-21 13:09:24 +0000351 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352 return 1;
353 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000354 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000355 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
357 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000358}
359
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   Return NULL on failure or when zero bytes were obtained (EOF),
   else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
           and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
           room (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the
           overflow.  In this case, fp_readl is called in a loop (with an
           expanded buffer) until the buffer ends with a '\n' (or until
           the end of the file is reached): see tok_nextc and its calls
           to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *line = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (line != NULL) {
        /* take the pending object out of TOK */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(line))
            encoded = line;     /* case 3: already UTF-8 bytes */
    }
    else {
        line = PyObject_CallObject(tok->decoding_readline, NULL);
        if (line == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(line);
        Py_DECREF(line);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* keep what does not fit for the next call */
        tok->decoding_buffer = PyString_FromStringAndSize(bytes + size,
                                                          nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
424
425/* Set the readline function for TOK to a StreamReader's
426 readline function. The StreamReader is named ENC.
427
428 This function is called from check_bom and check_coding_spec.
429
430 ENC is usually identical to the future value of tok->encoding,
431 except for the (currently unsupported) case of UTF-16.
432
433 Return 1 on success, 0 on failure. */
434
435static int
436fp_setreadl(struct tok_state *tok, const char* enc)
437{
438 PyObject *reader, *stream, *readline;
439
Martin v. Löwis95292d62002-12-11 14:04:59 +0000440 /* XXX: constify filename argument. */
441 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000442 if (stream == NULL)
443 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
445 reader = PyCodec_StreamReader(enc, stream, NULL);
446 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000447 if (reader == NULL)
448 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
450 readline = PyObject_GetAttrString(reader, "readline");
451 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000452 if (readline == NULL)
453 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
455 tok->decoding_readline = readline;
456 return 1;
457}
458
459/* Fetch the next byte from TOK. */
460
461static int fp_getc(struct tok_state *tok) {
462 return getc(tok->fp);
463}
464
465/* Unfetch the last byte back into TOK. */
466
467static void fp_ungetc(int c, struct tok_state *tok) {
468 ungetc(c, tok->fp);
469}
470
471/* Read a line of input from TOK. Determine encoding
472 if necessary. */
473
474static char *
475decoding_fgets(char *s, int size, struct tok_state *tok)
476{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000477 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000478 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000479 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480 if (tok->decoding_state < 0) {
481 /* We already have a codec associated with
482 this input. */
483 line = fp_readl(s, size, tok);
484 break;
485 } else if (tok->decoding_state > 0) {
486 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000487 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000489 break;
490 } else {
491 /* We have not yet determined the encoding.
492 If an encoding is found, use the file-pointer
493 reader functions from now on. */
494 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
495 return error_ret(tok);
496 assert(tok->decoding_state != 0);
497 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000498 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
500 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
501 return error_ret(tok);
502 }
503 }
504#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000505 /* The default encoding is ASCII, so make sure we don't have any
506 non-ASCII bytes in it. */
507 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000508 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000509 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510 if (*c > 127) {
511 badchar = *c;
512 break;
513 }
514 }
515 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000516 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000517 /* Need to add 1 to the line number, since this line
518 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000519 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000520 "Non-ASCII character '\\x%.2x' "
521 "in file %.200s on line %i, "
522 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000523 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000524 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000525 PyErr_SetString(PyExc_SyntaxError, buf);
526 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 }
528#endif
529 return line;
530}
531
532static int
533decoding_feof(struct tok_state *tok)
534{
535 if (tok->decoding_state >= 0) {
536 return feof(tok->fp);
537 } else {
538 PyObject* buf = tok->decoding_buffer;
539 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000540 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541 if (buf == NULL) {
542 error_ret(tok);
543 return 1;
544 } else {
545 tok->decoding_buffer = buf;
546 }
547 }
548 return PyObject_Length(buf) == 0;
549 }
550}
551
552/* Fetch a byte from TOK, using the string buffer. */
553
Tim Petersc9d78aa2006-03-26 23:27:58 +0000554static int
555buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000556 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557}
558
559/* Unfetch a byte from TOK, using the string buffer. */
560
Tim Petersc9d78aa2006-03-26 23:27:58 +0000561static void
562buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000564 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565}
566
567/* Set the readline function for TOK to ENC. For the string-based
568 tokenizer, this means to just record the encoding. */
569
Tim Petersc9d78aa2006-03-26 23:27:58 +0000570static int
571buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572 tok->enc = enc;
573 return 1;
574}
575
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded == NULL) {
        return NULL;
    }
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591
592/* Decode a byte string STR for use as the buffer of TOK.
593 Look for encoding declarations inside STR, and record them
594 inside TOK. */
595
596static const char *
597decode_str(const char *str, struct tok_state *tok)
598{
599 PyObject* utf8 = NULL;
600 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000601 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 int lineno = 0;
603 tok->enc = NULL;
604 tok->str = str;
605 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000606 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000608 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000609#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000610 if (tok->enc != NULL) {
611 utf8 = translate_into_utf8(str, tok->enc);
612 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000613 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000614 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000616#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000617 for (s = str;; s++) {
618 if (*s == '\0') break;
619 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000620 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000621 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 lineno++;
623 if (lineno == 2) break;
624 }
625 }
626 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000627 /* need to check line 1 and 2 separately since check_coding_spec
628 assumes a single line as input */
629 if (newl[0]) {
630 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
631 return error_ret(tok);
632 if (tok->enc == NULL && newl[1]) {
633 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
634 tok, buf_setreadl))
635 return error_ret(tok);
636 }
637 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000638#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 if (tok->enc != NULL) {
640 assert(utf8 == NULL);
641 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000642 if (utf8 == NULL) {
643 PyErr_Format(PyExc_SyntaxError,
644 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000645 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000646 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000647 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000649#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 assert(tok->decoding_buffer == NULL);
651 tok->decoding_buffer = utf8; /* CAUTION */
652 return str;
653}
654
655#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656
657/* Set up tokenizer for string */
658
659struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000660PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661{
662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 if (str == NULL) {
667 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000669 }
670
Martin v. Löwis95292d62002-12-11 14:04:59 +0000671 /* XXX: constify members. */
672 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 return tok;
674}
675
676
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000677/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678
679struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000680PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681{
682 struct tok_state *tok = tok_new();
683 if (tok == NULL)
684 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000685 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000686 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687 return NULL;
688 }
689 tok->cur = tok->inp = tok->buf;
690 tok->end = tok->buf + BUFSIZ;
691 tok->fp = fp;
692 tok->prompt = ps1;
693 tok->nextprompt = ps2;
694 return tok;
695}
696
697
698/* Free a tok_state structure */
699
700void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000701PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000704 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000705#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000706 Py_XDECREF(tok->decoding_readline);
707 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000708#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000709 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000710 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000711 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712}
713
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP from sys.stdin's encoding to UTF-8.

   Nothing is done unless sys.stdin is a file object wrapping the C
   stdin with a string f_encoding.  On success *INP is replaced by a
   newly allocated UTF-8 copy (the old string is freed) and
   tok->encoding is set to a copy of the encoding name.  If decoding or
   encoding fails the error is cleared and *INP is left untouched.
   Returns 0 in all those cases; returns -1, with tok->done set to
   E_NOMEM, when a string copy cannot be allocated. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode, *encoded;
    const char *enc_name;
    char *replacement;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin) {
        return 0;
    }
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj)) {
        return 0;
    }

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj)) {
        return 0;
    }
    /* keep the encoding object alive while its buffer is in use */
    Py_INCREF(enc_obj);

    enc_name = PyString_AsString(enc_obj);
    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode == NULL) {
        goto fallback;
    }

    encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (encoded == NULL) {
        goto fallback;
    }

    assert(PyString_Check(encoded));
    replacement = new_string(PyString_AS_STRING(encoded),
                             PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (replacement == NULL) {
        goto no_memory;
    }

    PyMem_FREE(*inp);
    *inp = replacement;
    if (tok->encoding != NULL) {
        PyMem_FREE(tok->encoding);
    }
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL) {
        goto no_memory;
    }

    Py_DECREF(enc_obj);
    return 0;

fallback:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;

no_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773
774/* Get next char, updating state; error code goes into tok->done */
775
776static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000777tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000780 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000781 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000782 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000783 if (tok->done != E_OK)
784 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000786 char *end = strchr(tok->inp, '\n');
787 if (end != NULL)
788 end++;
789 else {
790 end = strchr(tok->inp, '\0');
791 if (end == tok->inp) {
792 tok->done = E_EOF;
793 return EOF;
794 }
795 }
796 if (tok->start == NULL)
797 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000798 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000799 tok->lineno++;
800 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000801 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000804 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805 if (tok->nextprompt != NULL)
806 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000807 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000809 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000810 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 tok->done = E_EOF;
812 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000813#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000814 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000815 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000816#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000818 size_t start = tok->start - tok->buf;
819 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000820 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000821 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000822 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 tok->lineno++;
824 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000825 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000826 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000827 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000828 tok->done = E_NOMEM;
829 return EOF;
830 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 tok->buf = buf;
832 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000833 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000834 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000835 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 tok->inp = tok->buf + newlen;
837 tok->end = tok->inp + 1;
838 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000839 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 else {
841 tok->lineno++;
842 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000843 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000844 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000845 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000847 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 tok->inp = strchr(tok->buf, '\0');
849 tok->end = tok->inp + 1;
850 }
851 }
852 else {
853 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000854 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000855 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 if (tok->start == NULL) {
857 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000858 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000859 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 if (tok->buf == NULL) {
861 tok->done = E_NOMEM;
862 return EOF;
863 }
864 tok->end = tok->buf + BUFSIZ;
865 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000866 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
867 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000868 tok->done = E_EOF;
869 done = 1;
870 }
871 else {
872 tok->done = E_OK;
873 tok->inp = strchr(tok->buf, '\0');
874 done = tok->inp[-1] == '\n';
875 }
876 }
877 else {
878 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000879 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000880 tok->done = E_EOF;
881 done = 1;
882 }
883 else
884 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000885 }
886 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000889 Py_ssize_t curstart = tok->start == NULL ? -1 :
890 tok->start - tok->buf;
891 Py_ssize_t curvalid = tok->inp - tok->buf;
892 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000894 newbuf = (char *)PyMem_REALLOC(newbuf,
895 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 if (newbuf == NULL) {
897 tok->done = E_NOMEM;
898 tok->cur = tok->inp;
899 return EOF;
900 }
901 tok->buf = newbuf;
902 tok->inp = tok->buf + curvalid;
903 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 tok->start = curstart < 0 ? NULL :
905 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000906 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000907 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000908 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000909 /* Break out early on decoding
910 errors, as tok->buf will be NULL
911 */
912 if (tok->decoding_erred)
913 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000914 /* Last line does not end in \n,
915 fake one */
916 strcpy(tok->inp, "\n");
917 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000918 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000919 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000920 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000921 if (tok->buf != NULL) {
922 tok->cur = tok->buf + cur;
923 tok->line_start = tok->cur;
924 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000925 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000926 pt = tok->inp - 2;
927 if (pt >= tok->buf && *pt == '\r') {
928 *pt++ = '\n';
929 *pt = '\0';
930 tok->inp = pt;
931 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000932 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000933 }
934 if (tok->done != E_OK) {
935 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000936 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000937 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000938 return EOF;
939 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000940 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942}
943
944
945/* Back-up one character */
946
947static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000948tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949{
950 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000951 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000952 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953 if (*tok->cur != c)
954 *tok->cur = c;
955 }
956}
957
958
959/* Return the token corresponding to a single character */
960
961int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000962PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000963{
964 switch (c) {
965 case '(': return LPAR;
966 case ')': return RPAR;
967 case '[': return LSQB;
968 case ']': return RSQB;
969 case ':': return COLON;
970 case ',': return COMMA;
971 case ';': return SEMI;
972 case '+': return PLUS;
973 case '-': return MINUS;
974 case '*': return STAR;
975 case '/': return SLASH;
976 case '|': return VBAR;
977 case '&': return AMPER;
978 case '<': return LESS;
979 case '>': return GREATER;
980 case '=': return EQUAL;
981 case '.': return DOT;
982 case '%': return PERCENT;
983 case '`': return BACKQUOTE;
984 case '{': return LBRACE;
985 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000986 case '^': return CIRCUMFLEX;
987 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000988 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000989 default: return OP;
990 }
991}
992
993
Guido van Rossumfbab9051991-10-20 20:25:03 +0000994int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000995PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000996{
997 switch (c1) {
998 case '=':
999 switch (c2) {
1000 case '=': return EQEQUAL;
1001 }
1002 break;
1003 case '!':
1004 switch (c2) {
1005 case '=': return NOTEQUAL;
1006 }
1007 break;
1008 case '<':
1009 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001010 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001011 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001012 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001013 }
1014 break;
1015 case '>':
1016 switch (c2) {
1017 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001018 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001019 }
1020 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001021 case '+':
1022 switch (c2) {
1023 case '=': return PLUSEQUAL;
1024 }
1025 break;
1026 case '-':
1027 switch (c2) {
1028 case '=': return MINEQUAL;
1029 }
1030 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001031 case '*':
1032 switch (c2) {
1033 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001034 case '=': return STAREQUAL;
1035 }
1036 break;
1037 case '/':
1038 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001039 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001040 case '=': return SLASHEQUAL;
1041 }
1042 break;
1043 case '|':
1044 switch (c2) {
1045 case '=': return VBAREQUAL;
1046 }
1047 break;
1048 case '%':
1049 switch (c2) {
1050 case '=': return PERCENTEQUAL;
1051 }
1052 break;
1053 case '&':
1054 switch (c2) {
1055 case '=': return AMPEREQUAL;
1056 }
1057 break;
1058 case '^':
1059 switch (c2) {
1060 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001061 }
1062 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001063 }
1064 return OP;
1065}
1066
Thomas Wouters434d0822000-08-24 20:11:32 +00001067int
1068PyToken_ThreeChars(int c1, int c2, int c3)
1069{
1070 switch (c1) {
1071 case '<':
1072 switch (c2) {
1073 case '<':
1074 switch (c3) {
1075 case '=':
1076 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 }
1078 break;
1079 }
1080 break;
1081 case '>':
1082 switch (c2) {
1083 case '>':
1084 switch (c3) {
1085 case '=':
1086 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001087 }
1088 break;
1089 }
1090 break;
1091 case '*':
1092 switch (c2) {
1093 case '*':
1094 switch (c3) {
1095 case '=':
1096 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001097 }
1098 break;
1099 }
1100 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001101 case '/':
1102 switch (c2) {
1103 case '/':
1104 switch (c3) {
1105 case '=':
1106 return DOUBLESLASHEQUAL;
1107 }
1108 break;
1109 }
1110 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001111 }
1112 return OP;
1113}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001114
Guido van Rossum926f13a1998-04-09 21:38:06 +00001115static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001116indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001117{
1118 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001119 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 tok->cur = tok->inp;
1121 return 1;
1122 }
1123 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001124 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1125 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001126 tok->altwarning = 0;
1127 }
1128 return 0;
1129}
1130
1131
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001132/* Get next token, after space stripping etc. */
1133
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001134static int
1135tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136{
1137 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001138 int blankline;
1139
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001140 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001141 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001142 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 blankline = 0;
1144
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001145 /* Get indentation level */
1146 if (tok->atbol) {
1147 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001148 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001150 for (;;) {
1151 c = tok_nextc(tok);
1152 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001153 col++, altcol++;
1154 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001155 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001156 altcol = (altcol/tok->alttabsize + 1)
1157 * tok->alttabsize;
1158 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001159 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001161 else
1162 break;
1163 }
1164 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001165 if (c == '#' || c == '\n') {
1166 /* Lines with only whitespace and/or comments
1167 shouldn't affect the indentation and are
1168 not passed to the parser as NEWLINE tokens,
1169 except *totally* empty lines in interactive
1170 mode, which signal the end of a command group. */
1171 if (col == 0 && c == '\n' && tok->prompt != NULL)
1172 blankline = 0; /* Let it through */
1173 else
1174 blankline = 1; /* Ignore completely */
1175 /* We can't jump back right here since we still
1176 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001177 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001178 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 if (col == tok->indstack[tok->indent]) {
1180 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001181 if (altcol != tok->altindstack[tok->indent]) {
1182 if (indenterror(tok))
1183 return ERRORTOKEN;
1184 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001186 else if (col > tok->indstack[tok->indent]) {
1187 /* Indent -- always one */
1188 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001189 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001190 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001191 return ERRORTOKEN;
1192 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001193 if (altcol <= tok->altindstack[tok->indent]) {
1194 if (indenterror(tok))
1195 return ERRORTOKEN;
1196 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001197 tok->pendin++;
1198 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001199 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001201 else /* col < tok->indstack[tok->indent] */ {
1202 /* Dedent -- any number, must be consistent */
1203 while (tok->indent > 0 &&
1204 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001205 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001206 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001207 }
1208 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001209 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001210 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001211 return ERRORTOKEN;
1212 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001213 if (altcol != tok->altindstack[tok->indent]) {
1214 if (indenterror(tok))
1215 return ERRORTOKEN;
1216 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 }
1218 }
1219 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001220
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001221 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001222
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 /* Return pending indents/dedents */
1224 if (tok->pendin != 0) {
1225 if (tok->pendin < 0) {
1226 tok->pendin++;
1227 return DEDENT;
1228 }
1229 else {
1230 tok->pendin--;
1231 return INDENT;
1232 }
1233 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001234
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001236 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 /* Skip spaces */
1238 do {
1239 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001240 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001241
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001242 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001243 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001244
Guido van Rossumab5ca152000-03-31 00:52:27 +00001245 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001246 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001247 static char *tabforms[] = {
1248 "tab-width:", /* Emacs */
1249 ":tabstop=", /* vim, full form */
1250 ":ts=", /* vim, abbreviated form */
1251 "set tabsize=", /* will vi never die? */
1252 /* more templates can be added here to support other editors */
1253 };
1254 char cbuf[80];
1255 char *tp, **cp;
1256 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001258 *tp++ = c = tok_nextc(tok);
1259 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001260 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001261 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001262 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001263 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1264 cp++) {
1265 if ((tp = strstr(cbuf, *cp))) {
1266 int newsize = atoi(tp + strlen(*cp));
1267
1268 if (newsize >= 1 && newsize <= 40) {
1269 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001270 if (Py_VerboseFlag)
1271 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001272 "Tab size set to %d\n",
1273 newsize);
1274 }
1275 }
1276 }
1277 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001280
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001282 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001284 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001285
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 /* Identifier (most frequent token!) */
1287 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001288 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001289 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001290 case 'b':
1291 case 'B':
1292 c = tok_nextc(tok);
1293 if (c == 'r' || c == 'R')
1294 c = tok_nextc(tok);
1295 if (c == '"' || c == '\'')
1296 goto letter_quote;
1297 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001298 case 'r':
1299 case 'R':
1300 c = tok_nextc(tok);
1301 if (c == '"' || c == '\'')
1302 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001303 break;
1304 case 'u':
1305 case 'U':
1306 c = tok_nextc(tok);
1307 if (c == 'r' || c == 'R')
1308 c = tok_nextc(tok);
1309 if (c == '"' || c == '\'')
1310 goto letter_quote;
1311 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001312 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001313 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001315 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001317 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 *p_end = tok->cur;
1319 return NAME;
1320 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001321
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 /* Newline */
1323 if (c == '\n') {
1324 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001325 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001326 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001327 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001329 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001330 return NEWLINE;
1331 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001332
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001333 /* Period or number starting with period? */
1334 if (c == '.') {
1335 c = tok_nextc(tok);
1336 if (isdigit(c)) {
1337 goto fraction;
1338 }
1339 else {
1340 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001341 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001342 *p_end = tok->cur;
1343 return DOT;
1344 }
1345 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001346
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 /* Number */
1348 if (isdigit(c)) {
1349 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001350 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 c = tok_nextc(tok);
1352 if (c == '.')
1353 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001354#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001355 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001356 goto imaginary;
1357#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001359
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001361 c = tok_nextc(tok);
1362 if (!isxdigit(c)) {
1363 tok->done = E_TOKEN;
1364 tok_backup(tok, c);
1365 return ERRORTOKEN;
1366 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 do {
1368 c = tok_nextc(tok);
1369 } while (isxdigit(c));
1370 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001371 else if (c == 'o' || c == 'O') {
1372 /* Octal */
1373 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001374 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001375 tok->done = E_TOKEN;
1376 tok_backup(tok, c);
1377 return ERRORTOKEN;
1378 }
1379 do {
1380 c = tok_nextc(tok);
1381 } while ('0' <= c && c < '8');
1382 }
1383 else if (c == 'b' || c == 'B') {
1384 /* Binary */
1385 c = tok_nextc(tok);
1386 if (c != '0' && c != '1') {
1387 tok->done = E_TOKEN;
1388 tok_backup(tok, c);
1389 return ERRORTOKEN;
1390 }
1391 do {
1392 c = tok_nextc(tok);
1393 } while (c == '0' || c == '1');
1394 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001396 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 /* Octal; c is first char of it */
1398 /* There's no 'isoctdigit' macro, sigh */
1399 while ('0' <= c && c < '8') {
1400 c = tok_nextc(tok);
1401 }
Tim Petersd507dab2001-08-30 20:51:59 +00001402 if (isdigit(c)) {
1403 found_decimal = 1;
1404 do {
1405 c = tok_nextc(tok);
1406 } while (isdigit(c));
1407 }
1408 if (c == '.')
1409 goto fraction;
1410 else if (c == 'e' || c == 'E')
1411 goto exponent;
1412#ifndef WITHOUT_COMPLEX
1413 else if (c == 'j' || c == 'J')
1414 goto imaginary;
1415#endif
1416 else if (found_decimal) {
1417 tok->done = E_TOKEN;
1418 tok_backup(tok, c);
1419 return ERRORTOKEN;
1420 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001422 if (c == 'l' || c == 'L')
1423 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 }
1425 else {
1426 /* Decimal */
1427 do {
1428 c = tok_nextc(tok);
1429 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001430 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001431 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001432 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001433 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001434 if (c == '.') {
1435 fraction:
1436 /* Fraction */
1437 do {
1438 c = tok_nextc(tok);
1439 } while (isdigit(c));
1440 }
1441 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001442 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001443 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001445 if (c == '+' || c == '-')
1446 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001447 if (!isdigit(c)) {
1448 tok->done = E_TOKEN;
1449 tok_backup(tok, c);
1450 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001451 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001452 do {
1453 c = tok_nextc(tok);
1454 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001456#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001457 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001458 /* Imaginary part */
1459 imaginary:
1460 c = tok_nextc(tok);
1461#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462 }
1463 }
1464 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001465 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 *p_end = tok->cur;
1467 return NUMBER;
1468 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001469
1470 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001471 /* String */
1472 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001473 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001474 int quote = c;
1475 int triple = 0;
1476 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001477 for (;;) {
1478 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001479 if (c == '\n') {
1480 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001481 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001482 tok_backup(tok, c);
1483 return ERRORTOKEN;
1484 }
1485 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001486 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001487 }
1488 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001489 if (triple)
1490 tok->done = E_EOFS;
1491 else
1492 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001493 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001494 return ERRORTOKEN;
1495 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001496 else if (c == quote) {
1497 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001498 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001499 c = tok_nextc(tok);
1500 if (c == quote) {
1501 triple = 1;
1502 tripcount = 0;
1503 continue;
1504 }
1505 tok_backup(tok, c);
1506 }
1507 if (!triple || tripcount == 3)
1508 break;
1509 }
1510 else if (c == '\\') {
1511 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001512 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001513 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001514 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001515 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516 return ERRORTOKEN;
1517 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001518 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001519 else
1520 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001522 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001523 *p_end = tok->cur;
1524 return STRING;
1525 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001526
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001527 /* Line continuation */
1528 if (c == '\\') {
1529 c = tok_nextc(tok);
1530 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001531 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001532 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001533 return ERRORTOKEN;
1534 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001535 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001536 goto again; /* Read next line */
1537 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001538
Guido van Rossumfbab9051991-10-20 20:25:03 +00001539 /* Check for two-character token */
1540 {
1541 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001542 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001543#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001544 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001545 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001546 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001547 tok->filename, tok->lineno,
1548 NULL, NULL)) {
1549 return ERRORTOKEN;
1550 }
1551 }
1552#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001553 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001554 int c3 = tok_nextc(tok);
1555 int token3 = PyToken_ThreeChars(c, c2, c3);
1556 if (token3 != OP) {
1557 token = token3;
1558 } else {
1559 tok_backup(tok, c3);
1560 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001561 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001562 *p_end = tok->cur;
1563 return token;
1564 }
1565 tok_backup(tok, c2);
1566 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001567
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001568 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001569 switch (c) {
1570 case '(':
1571 case '[':
1572 case '{':
1573 tok->level++;
1574 break;
1575 case ')':
1576 case ']':
1577 case '}':
1578 tok->level--;
1579 break;
1580 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001581
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001582 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001583 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001584 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001585 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001586}
1587
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001588int
1589PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1590{
1591 int result = tok_get(tok, p_start, p_end);
1592 if (tok->decoding_erred) {
1593 result = ERRORTOKEN;
1594 tok->done = E_DECODE;
1595 }
1596 return result;
1597}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001598
Martin v. Löwisa5136192007-09-04 14:19:28 +00001599/* This function is only called from parsetok. However, it cannot live
1600 there, as it must be empty for PGEN, and we can check for PGEN only
1601 in this file. */
1602
Christian Heimes082c9b02008-01-23 14:20:50 +00001603#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building PGEN or without Unicode support: there is
   nothing to re-encode, so always return NULL and leave *offset alone. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    /* Parameters are intentionally unused in this configuration. */
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1609#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001610#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001611static PyObject *
1612dec_utf8(const char *enc, const char *text, size_t len) {
1613 PyObject *ret = NULL;
1614 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1615 if (unicode_text) {
1616 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1617 Py_DECREF(unicode_text);
1618 }
1619 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001620 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001621 }
1622 return ret;
1623}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001624char *
1625PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1626{
1627 char *text = NULL;
1628 if (tok->encoding) {
1629 /* convert source to original encondig */
1630 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1631 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001632 int linelen = PyString_Size(lineobj);
1633 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001634 text = PyObject_MALLOC(linelen + 1);
1635 if (text != NULL && line != NULL) {
1636 if (linelen)
1637 strncpy(text, line, linelen);
1638 text[linelen] = '\0';
1639 }
1640 Py_DECREF(lineobj);
1641
1642 /* adjust error offset */
1643 if (*offset > 1) {
1644 PyObject *offsetobj = dec_utf8(tok->encoding,
1645 tok->buf, *offset-1);
1646 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001647 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001648 Py_DECREF(offsetobj);
1649 }
1650 }
1651
1652 }
1653 }
1654 return text;
1655
1656}
Georg Brandl76b30d12008-01-07 18:41:34 +00001657#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001658#endif
1659
Martin v. Löwisa5136192007-09-04 14:19:28 +00001660
Guido van Rossum408027e1996-12-30 16:17:54 +00001661#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001662
1663void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001664tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001665{
Guido van Rossum86bea461997-04-29 21:03:06 +00001666 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001667 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1668 printf("(%.*s)", (int)(end - start), start);
1669}
1670
1671#endif