blob: f5c18e0acc6f6ea6565c77cc592b2556ea91f4b1 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042/* Token names */
43
/* Printable names of the token types.
   This table must match the #defines in token.h, so entries may only
   be added or reordered together with that header. */
char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Anthony Baxter11490022006-04-11 05:39:14 +0000108 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
109 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000110 if (tok == NULL)
111 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000112 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000113 tok->done = E_OK;
114 tok->fp = NULL;
115 tok->tabsize = TABSIZE;
116 tok->indent = 0;
117 tok->indstack[0] = 0;
118 tok->atbol = 1;
119 tok->pendin = 0;
120 tok->prompt = tok->nextprompt = NULL;
121 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000122 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000123 tok->filename = NULL;
124 tok->altwarning = 0;
125 tok->alterror = 0;
126 tok->alttabsize = 1;
127 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000128 tok->decoding_state = 0;
129 tok->decoding_erred = 0;
130 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* PGEN build: strings are used as-is, so the input pointer is handed
   back unchanged and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;  /* unused in this build */
    return str;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000167 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000173new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174{
Neal Norwitz08062d62006-04-11 08:19:15 +0000175 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'. Returns the static string "utf-8" or
   "iso-8859-1" when S names (or starts with a dash-suffixed variant of)
   one of those encodings; otherwise returns S itself, unmodified.
   Callers can therefore test `result != s` to see whether a
   replacement name was chosen. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for values representable as
               unsigned char (or EOF); plain char may be negative. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000211 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
230
231 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000232 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000233 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000234 t++;
235
236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000240 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000241 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
248}
249
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000281 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000282#else
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
286 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000287 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289 }
290 } else { /* then, compare cs with BOM */
291 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000295 if (!r) {
296 cs = tok->encoding;
297 if (!cs)
298 cs = "with BOM";
299 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000301 return r;
302}
303
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */
307
308static int
309check_bom(int get_char(struct tok_state *),
310 void unget_char(int, struct tok_state *),
311 int set_readline(struct tok_state *, const char *),
312 struct tok_state *tok)
313{
314 int ch = get_char(tok);
315 tok->decoding_state = 1;
316 if (ch == EOF) {
317 return 1;
318 } else if (ch == 0xEF) {
319 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
320 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
321#if 0
322 /* Disable support for UTF-16 BOMs until a decision
323 is made whether this needs to be supported. */
324 } else if (ch == 0xFE) {
325 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
326 if (!set_readline(tok, "utf-16-be")) return 0;
327 tok->decoding_state = -1;
328 } else if (ch == 0xFF) {
329 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-le")) return 0;
331 tok->decoding_state = -1;
332#endif
333 } else {
334 unget_char(ch, tok);
335 return 1;
336 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000337 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000338 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
340 return 1;
341 NON_BOM:
342 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
344 return 1;
345}
346
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361
/* Copy the next chunk of decoded input, as UTF-8, into S.

   At most SIZE-1 bytes are copied and S is always NUL-terminated.
   Bytes that do not fit are kept as a string object in
   tok->decoding_buffer for the next call. Returns S, or NULL on
   error (via error_ret) and when there is nothing left to copy. */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the stored object; an exact string object is
           already UTF-8 overflow from an earlier call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Too long for S: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    if (nbytes == 0)
        return NULL;            /* EOF */
    return s;
#endif
}
411
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */
421
422static int
423fp_setreadl(struct tok_state *tok, const char* enc)
424{
425 PyObject *reader, *stream, *readline;
426
Martin v. Löwis95292d62002-12-11 14:04:59 +0000427 /* XXX: constify filename argument. */
428 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 if (stream == NULL)
430 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000431
432 reader = PyCodec_StreamReader(enc, stream, NULL);
433 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000434 if (reader == NULL)
435 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
437 readline = PyObject_GetAttrString(reader, "readline");
438 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000439 if (readline == NULL)
440 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441
442 tok->decoding_readline = readline;
443 return 1;
444}
445
446/* Fetch the next byte from TOK. */
447
448static int fp_getc(struct tok_state *tok) {
449 return getc(tok->fp);
450}
451
452/* Unfetch the last byte back into TOK. */
453
454static void fp_ungetc(int c, struct tok_state *tok) {
455 ungetc(c, tok->fp);
456}
457
458/* Read a line of input from TOK. Determine encoding
459 if necessary. */
460
461static char *
462decoding_fgets(char *s, int size, struct tok_state *tok)
463{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000464 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000465 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000466 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467 if (tok->decoding_state < 0) {
468 /* We already have a codec associated with
469 this input. */
470 line = fp_readl(s, size, tok);
471 break;
472 } else if (tok->decoding_state > 0) {
473 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000474 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 break;
477 } else {
478 /* We have not yet determined the encoding.
479 If an encoding is found, use the file-pointer
480 reader functions from now on. */
481 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
482 return error_ret(tok);
483 assert(tok->decoding_state != 0);
484 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000485 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
487 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
488 return error_ret(tok);
489 }
490 }
491#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000492 /* The default encoding is ASCII, so make sure we don't have any
493 non-ASCII bytes in it. */
494 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000496 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000497 if (*c > 127) {
498 badchar = *c;
499 break;
500 }
501 }
502 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000503 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000504 /* Need to add 1 to the line number, since this line
505 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000506 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000507 "Non-ASCII character '\\x%.2x' "
508 "in file %.200s on line %i, "
509 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000510 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000511 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000512 PyErr_SetString(PyExc_SyntaxError, buf);
513 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514 }
515#endif
516 return line;
517}
518
519static int
520decoding_feof(struct tok_state *tok)
521{
522 if (tok->decoding_state >= 0) {
523 return feof(tok->fp);
524 } else {
525 PyObject* buf = tok->decoding_buffer;
526 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000527 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 if (buf == NULL) {
529 error_ret(tok);
530 return 1;
531 } else {
532 tok->decoding_buffer = buf;
533 }
534 }
535 return PyObject_Length(buf) == 0;
536 }
537}
538
539/* Fetch a byte from TOK, using the string buffer. */
540
Tim Petersc9d78aa2006-03-26 23:27:58 +0000541static int
542buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000543 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544}
545
546/* Unfetch a byte from TOK, using the string buffer. */
547
Tim Petersc9d78aa2006-03-26 23:27:58 +0000548static void
549buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000551 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
554/* Set the readline function for TOK to ENC. For the string-based
555 tokenizer, this means to just record the encoding. */
556
Tim Petersc9d78aa2006-03-26 23:27:58 +0000557static int
558buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 tok->enc = enc;
560 return 1;
561}
562
563/* Return a UTF-8 encoding Python string object from the
564 C byte string STR, which is encoded with ENC. */
565
Martin v. Löwis019934b2002-08-07 12:33:18 +0000566#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567static PyObject *
568translate_into_utf8(const char* str, const char* enc) {
569 PyObject *utf8;
570 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
571 if (buf == NULL)
572 return NULL;
573 utf8 = PyUnicode_AsUTF8String(buf);
574 Py_DECREF(buf);
575 return utf8;
576}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000577#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578
579/* Decode a byte string STR for use as the buffer of TOK.
580 Look for encoding declarations inside STR, and record them
581 inside TOK. */
582
583static const char *
584decode_str(const char *str, struct tok_state *tok)
585{
586 PyObject* utf8 = NULL;
587 const char *s;
Guido van Rossum1c4282b2008-01-25 06:11:53 +0000588 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 int lineno = 0;
590 tok->enc = NULL;
591 tok->str = str;
592 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000595 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 if (tok->enc != NULL) {
598 utf8 = translate_into_utf8(str, tok->enc);
599 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000600 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 str = PyString_AsString(utf8);
602 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000603#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 for (s = str;; s++) {
605 if (*s == '\0') break;
606 else if (*s == '\n') {
Georg Brandl7bdff2c2008-01-21 18:35:52 +0000607 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 lineno++;
609 if (lineno == 2) break;
610 }
611 }
612 tok->enc = NULL;
Georg Brandl7bdff2c2008-01-21 18:35:52 +0000613 /* need to check line 1 and 2 separately since check_coding_spec
614 assumes a single line as input */
615 if (newl[0]) {
616 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
617 return error_ret(tok);
618 if (tok->enc == NULL && newl[1]) {
619 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
620 tok, buf_setreadl))
621 return error_ret(tok);
622 }
623 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000624#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 if (tok->enc != NULL) {
626 assert(utf8 == NULL);
627 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000628 if (utf8 == NULL) {
629 PyErr_Format(PyExc_SyntaxError,
630 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000631 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000632 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000633 str = PyString_AsString(utf8);
634 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000635#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 assert(tok->decoding_buffer == NULL);
637 tok->decoding_buffer = utf8; /* CAUTION */
638 return str;
639}
640
641#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642
643/* Set up tokenizer for string */
644
645struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000646PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647{
648 struct tok_state *tok = tok_new();
649 if (tok == NULL)
650 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000652 if (str == NULL) {
653 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000654 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000655 }
656
Martin v. Löwis95292d62002-12-11 14:04:59 +0000657 /* XXX: constify members. */
658 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659 return tok;
660}
661
662
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000663/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000664
665struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000666PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667{
668 struct tok_state *tok = tok_new();
669 if (tok == NULL)
670 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000671 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000672 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 return NULL;
674 }
675 tok->cur = tok->inp = tok->buf;
676 tok->end = tok->buf + BUFSIZ;
677 tok->fp = fp;
678 tok->prompt = ps1;
679 tok->nextprompt = ps2;
680 return tok;
681}
682
683
684/* Free a tok_state structure */
685
686void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000687PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000688{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000689 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000690 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000691#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000692 Py_XDECREF(tok->decoding_readline);
693 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000694#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000695 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000696 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000697 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000698}
699
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000700#if !defined(PGEN) && defined(Py_USING_UNICODE)
701static int
702tok_stdin_decode(struct tok_state *tok, char **inp)
703{
704 PyObject *enc, *sysstdin, *decoded, *utf8;
705 const char *encoding;
706 char *converted;
707
708 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
709 return 0;
710 sysstdin = PySys_GetObject("stdin");
711 if (sysstdin == NULL || !PyFile_Check(sysstdin))
712 return 0;
713
714 enc = ((PyFileObject *)sysstdin)->f_encoding;
715 if (enc == NULL || !PyString_Check(enc))
716 return 0;
717 Py_INCREF(enc);
718
719 encoding = PyString_AsString(enc);
720 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
721 if (decoded == NULL)
722 goto error_clear;
723
724 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
725 Py_DECREF(decoded);
726 if (utf8 == NULL)
727 goto error_clear;
728
Neal Norwitz2aa9a5d2006-03-20 01:53:23 +0000729 assert(PyString_Check(utf8));
730 converted = new_string(PyString_AS_STRING(utf8),
731 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000732 Py_DECREF(utf8);
733 if (converted == NULL)
734 goto error_nomem;
735
Neal Norwitz08062d62006-04-11 08:19:15 +0000736 PyMem_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000737 *inp = converted;
738 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000739 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000740 tok->encoding = new_string(encoding, strlen(encoding));
741 if (tok->encoding == NULL)
742 goto error_nomem;
743
744 Py_DECREF(enc);
745 return 0;
746
747error_nomem:
748 Py_DECREF(enc);
749 tok->done = E_NOMEM;
750 return -1;
751
752error_clear:
753 /* Fallback to iso-8859-1: for backward compatibility */
754 Py_DECREF(enc);
755 PyErr_Clear();
756 return 0;
757}
758#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759
760/* Get next char, updating state; error code goes into tok->done */
761
762static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000763tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000766 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000767 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000768 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000769 if (tok->done != E_OK)
770 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000771 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000772 char *end = strchr(tok->inp, '\n');
773 if (end != NULL)
774 end++;
775 else {
776 end = strchr(tok->inp, '\0');
777 if (end == tok->inp) {
778 tok->done = E_EOF;
779 return EOF;
780 }
781 }
782 if (tok->start == NULL)
783 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000784 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000785 tok->lineno++;
786 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000787 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000790 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 if (tok->nextprompt != NULL)
792 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000793 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000795 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000796 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 tok->done = E_EOF;
798 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000799#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000800 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000801 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000802#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000804 size_t start = tok->start - tok->buf;
805 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000806 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000807 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000808 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000809 tok->lineno++;
810 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000811 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000812 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000813 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000814 tok->done = E_NOMEM;
815 return EOF;
816 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 tok->buf = buf;
818 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000819 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000820 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000821 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->inp = tok->buf + newlen;
823 tok->end = tok->inp + 1;
824 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000825 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000826 else {
827 tok->lineno++;
828 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000829 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000830 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000831 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000833 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 tok->inp = strchr(tok->buf, '\0');
835 tok->end = tok->inp + 1;
836 }
837 }
838 else {
839 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000840 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000841 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 if (tok->start == NULL) {
843 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000844 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000845 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 if (tok->buf == NULL) {
847 tok->done = E_NOMEM;
848 return EOF;
849 }
850 tok->end = tok->buf + BUFSIZ;
851 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000852 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
853 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 tok->done = E_EOF;
855 done = 1;
856 }
857 else {
858 tok->done = E_OK;
859 tok->inp = strchr(tok->buf, '\0');
860 done = tok->inp[-1] == '\n';
861 }
862 }
863 else {
864 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000865 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000866 tok->done = E_EOF;
867 done = 1;
868 }
869 else
870 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000871 }
872 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000875 Py_ssize_t curstart = tok->start == NULL ? -1 :
876 tok->start - tok->buf;
877 Py_ssize_t curvalid = tok->inp - tok->buf;
878 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000879 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000880 newbuf = (char *)PyMem_REALLOC(newbuf,
881 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 if (newbuf == NULL) {
883 tok->done = E_NOMEM;
884 tok->cur = tok->inp;
885 return EOF;
886 }
887 tok->buf = newbuf;
888 tok->inp = tok->buf + curvalid;
889 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 tok->start = curstart < 0 ? NULL :
891 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000892 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000894 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000895 /* Break out early on decoding
896 errors, as tok->buf will be NULL
897 */
898 if (tok->decoding_erred)
899 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000900 /* Last line does not end in \n,
901 fake one */
902 strcpy(tok->inp, "\n");
903 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000904 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000905 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000906 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000907 if (tok->buf != NULL) {
908 tok->cur = tok->buf + cur;
909 tok->line_start = tok->cur;
910 /* replace "\r\n" with "\n" */
911 /* For Mac leave the \r, giving syntax error */
912 pt = tok->inp - 2;
913 if (pt >= tok->buf && *pt == '\r') {
914 *pt++ = '\n';
915 *pt = '\0';
916 tok->inp = pt;
917 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000918 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919 }
920 if (tok->done != E_OK) {
921 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000922 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000923 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000924 return EOF;
925 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000926 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000927 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000928}
929
930
931/* Back-up one character */
932
933static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000934tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000935{
936 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000937 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000938 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000939 if (*tok->cur != c)
940 *tok->cur = c;
941 }
942}
943
944
945/* Return the token corresponding to a single character */
946
947int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000948PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949{
950 switch (c) {
951 case '(': return LPAR;
952 case ')': return RPAR;
953 case '[': return LSQB;
954 case ']': return RSQB;
955 case ':': return COLON;
956 case ',': return COMMA;
957 case ';': return SEMI;
958 case '+': return PLUS;
959 case '-': return MINUS;
960 case '*': return STAR;
961 case '/': return SLASH;
962 case '|': return VBAR;
963 case '&': return AMPER;
964 case '<': return LESS;
965 case '>': return GREATER;
966 case '=': return EQUAL;
967 case '.': return DOT;
968 case '%': return PERCENT;
969 case '`': return BACKQUOTE;
970 case '{': return LBRACE;
971 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000972 case '^': return CIRCUMFLEX;
973 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000974 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000975 default: return OP;
976 }
977}
978
979
Guido van Rossumfbab9051991-10-20 20:25:03 +0000980int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000981PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000982{
983 switch (c1) {
984 case '=':
985 switch (c2) {
986 case '=': return EQEQUAL;
987 }
988 break;
989 case '!':
990 switch (c2) {
991 case '=': return NOTEQUAL;
992 }
993 break;
994 case '<':
995 switch (c2) {
996 case '>': return NOTEQUAL;
997 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000998 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000999 }
1000 break;
1001 case '>':
1002 switch (c2) {
1003 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001004 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001005 }
1006 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001007 case '+':
1008 switch (c2) {
1009 case '=': return PLUSEQUAL;
1010 }
1011 break;
1012 case '-':
1013 switch (c2) {
1014 case '=': return MINEQUAL;
1015 }
1016 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001017 case '*':
1018 switch (c2) {
1019 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001020 case '=': return STAREQUAL;
1021 }
1022 break;
1023 case '/':
1024 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001025 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001026 case '=': return SLASHEQUAL;
1027 }
1028 break;
1029 case '|':
1030 switch (c2) {
1031 case '=': return VBAREQUAL;
1032 }
1033 break;
1034 case '%':
1035 switch (c2) {
1036 case '=': return PERCENTEQUAL;
1037 }
1038 break;
1039 case '&':
1040 switch (c2) {
1041 case '=': return AMPEREQUAL;
1042 }
1043 break;
1044 case '^':
1045 switch (c2) {
1046 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001047 }
1048 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001049 }
1050 return OP;
1051}
1052
Thomas Wouters434d0822000-08-24 20:11:32 +00001053int
1054PyToken_ThreeChars(int c1, int c2, int c3)
1055{
1056 switch (c1) {
1057 case '<':
1058 switch (c2) {
1059 case '<':
1060 switch (c3) {
1061 case '=':
1062 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001063 }
1064 break;
1065 }
1066 break;
1067 case '>':
1068 switch (c2) {
1069 case '>':
1070 switch (c3) {
1071 case '=':
1072 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001073 }
1074 break;
1075 }
1076 break;
1077 case '*':
1078 switch (c2) {
1079 case '*':
1080 switch (c3) {
1081 case '=':
1082 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 }
1084 break;
1085 }
1086 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001087 case '/':
1088 switch (c2) {
1089 case '/':
1090 switch (c3) {
1091 case '=':
1092 return DOUBLESLASHEQUAL;
1093 }
1094 break;
1095 }
1096 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001097 }
1098 return OP;
1099}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001100
Guido van Rossum926f13a1998-04-09 21:38:06 +00001101static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001102indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001103{
1104 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001105 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001106 tok->cur = tok->inp;
1107 return 1;
1108 }
1109 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001110 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1111 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112 tok->altwarning = 0;
1113 }
1114 return 0;
1115}
1116
1117
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118/* Get next token, after space stripping etc. */
1119
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001120static int
1121tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122{
1123 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001124 int blankline;
1125
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001126 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001127 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001128 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001129 blankline = 0;
1130
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001131 /* Get indentation level */
1132 if (tok->atbol) {
1133 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001134 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136 for (;;) {
1137 c = tok_nextc(tok);
1138 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001139 col++, altcol++;
1140 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001142 altcol = (altcol/tok->alttabsize + 1)
1143 * tok->alttabsize;
1144 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001145 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001146 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001147 else
1148 break;
1149 }
1150 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001151 if (c == '#' || c == '\n') {
1152 /* Lines with only whitespace and/or comments
1153 shouldn't affect the indentation and are
1154 not passed to the parser as NEWLINE tokens,
1155 except *totally* empty lines in interactive
1156 mode, which signal the end of a command group. */
1157 if (col == 0 && c == '\n' && tok->prompt != NULL)
1158 blankline = 0; /* Let it through */
1159 else
1160 blankline = 1; /* Ignore completely */
1161 /* We can't jump back right here since we still
1162 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001163 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001164 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001165 if (col == tok->indstack[tok->indent]) {
1166 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001167 if (altcol != tok->altindstack[tok->indent]) {
1168 if (indenterror(tok))
1169 return ERRORTOKEN;
1170 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001171 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001172 else if (col > tok->indstack[tok->indent]) {
1173 /* Indent -- always one */
1174 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001175 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001176 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 return ERRORTOKEN;
1178 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001179 if (altcol <= tok->altindstack[tok->indent]) {
1180 if (indenterror(tok))
1181 return ERRORTOKEN;
1182 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001183 tok->pendin++;
1184 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001185 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001186 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001187 else /* col < tok->indstack[tok->indent] */ {
1188 /* Dedent -- any number, must be consistent */
1189 while (tok->indent > 0 &&
1190 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001191 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001192 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001193 }
1194 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001195 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001196 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001197 return ERRORTOKEN;
1198 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001199 if (altcol != tok->altindstack[tok->indent]) {
1200 if (indenterror(tok))
1201 return ERRORTOKEN;
1202 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203 }
1204 }
1205 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001206
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001207 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001208
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 /* Return pending indents/dedents */
1210 if (tok->pendin != 0) {
1211 if (tok->pendin < 0) {
1212 tok->pendin++;
1213 return DEDENT;
1214 }
1215 else {
1216 tok->pendin--;
1217 return INDENT;
1218 }
1219 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001220
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001222 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 /* Skip spaces */
1224 do {
1225 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001226 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001227
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001228 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001229 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001230
Guido van Rossumab5ca152000-03-31 00:52:27 +00001231 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001233 static char *tabforms[] = {
1234 "tab-width:", /* Emacs */
1235 ":tabstop=", /* vim, full form */
1236 ":ts=", /* vim, abbreviated form */
1237 "set tabsize=", /* will vi never die? */
1238 /* more templates can be added here to support other editors */
1239 };
1240 char cbuf[80];
1241 char *tp, **cp;
1242 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001243 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001244 *tp++ = c = tok_nextc(tok);
1245 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001246 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001247 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001248 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001249 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1250 cp++) {
1251 if ((tp = strstr(cbuf, *cp))) {
1252 int newsize = atoi(tp + strlen(*cp));
1253
1254 if (newsize >= 1 && newsize <= 40) {
1255 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001256 if (Py_VerboseFlag)
1257 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001258 "Tab size set to %d\n",
1259 newsize);
1260 }
1261 }
1262 }
1263 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001265 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001266
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001268 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001270 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001271
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 /* Identifier (most frequent token!) */
1273 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001274 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001275 switch (c) {
1276 case 'r':
1277 case 'R':
1278 c = tok_nextc(tok);
1279 if (c == '"' || c == '\'')
1280 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001281 break;
1282 case 'u':
1283 case 'U':
1284 c = tok_nextc(tok);
1285 if (c == 'r' || c == 'R')
1286 c = tok_nextc(tok);
1287 if (c == '"' || c == '\'')
1288 goto letter_quote;
1289 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001290 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001291 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001293 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001295 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 *p_end = tok->cur;
1297 return NAME;
1298 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001299
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 /* Newline */
1301 if (c == '\n') {
1302 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001303 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001304 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001305 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001307 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 return NEWLINE;
1309 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001310
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001311 /* Period or number starting with period? */
1312 if (c == '.') {
1313 c = tok_nextc(tok);
1314 if (isdigit(c)) {
1315 goto fraction;
1316 }
1317 else {
1318 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001319 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001320 *p_end = tok->cur;
1321 return DOT;
1322 }
1323 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001324
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001325 /* Number */
1326 if (isdigit(c)) {
1327 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001328 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 c = tok_nextc(tok);
1330 if (c == '.')
1331 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001332#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001333 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001334 goto imaginary;
1335#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336 if (c == 'x' || c == 'X') {
1337 /* Hex */
1338 do {
1339 c = tok_nextc(tok);
1340 } while (isxdigit(c));
1341 }
1342 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001343 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 /* Octal; c is first char of it */
1345 /* There's no 'isoctdigit' macro, sigh */
1346 while ('0' <= c && c < '8') {
1347 c = tok_nextc(tok);
1348 }
Tim Petersd507dab2001-08-30 20:51:59 +00001349 if (isdigit(c)) {
1350 found_decimal = 1;
1351 do {
1352 c = tok_nextc(tok);
1353 } while (isdigit(c));
1354 }
1355 if (c == '.')
1356 goto fraction;
1357 else if (c == 'e' || c == 'E')
1358 goto exponent;
1359#ifndef WITHOUT_COMPLEX
1360 else if (c == 'j' || c == 'J')
1361 goto imaginary;
1362#endif
1363 else if (found_decimal) {
1364 tok->done = E_TOKEN;
1365 tok_backup(tok, c);
1366 return ERRORTOKEN;
1367 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001369 if (c == 'l' || c == 'L')
1370 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 }
1372 else {
1373 /* Decimal */
1374 do {
1375 c = tok_nextc(tok);
1376 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001377 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001378 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001379 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001380 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001381 if (c == '.') {
1382 fraction:
1383 /* Fraction */
1384 do {
1385 c = tok_nextc(tok);
1386 } while (isdigit(c));
1387 }
1388 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001389 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001390 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001392 if (c == '+' || c == '-')
1393 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001394 if (!isdigit(c)) {
1395 tok->done = E_TOKEN;
1396 tok_backup(tok, c);
1397 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001398 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001399 do {
1400 c = tok_nextc(tok);
1401 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001403#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001404 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001405 /* Imaginary part */
1406 imaginary:
1407 c = tok_nextc(tok);
1408#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 }
1410 }
1411 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001412 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 *p_end = tok->cur;
1414 return NUMBER;
1415 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001416
1417 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001418 /* String */
1419 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001420 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001421 int quote = c;
1422 int triple = 0;
1423 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 for (;;) {
1425 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001426 if (c == '\n') {
1427 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001428 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001429 tok_backup(tok, c);
1430 return ERRORTOKEN;
1431 }
1432 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001433 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001434 }
1435 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001436 if (triple)
1437 tok->done = E_EOFS;
1438 else
1439 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001440 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 return ERRORTOKEN;
1442 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001443 else if (c == quote) {
1444 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001445 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001446 c = tok_nextc(tok);
1447 if (c == quote) {
1448 triple = 1;
1449 tripcount = 0;
1450 continue;
1451 }
1452 tok_backup(tok, c);
1453 }
1454 if (!triple || tripcount == 3)
1455 break;
1456 }
1457 else if (c == '\\') {
1458 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001461 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001462 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 return ERRORTOKEN;
1464 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001465 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001466 else
1467 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001468 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001469 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001470 *p_end = tok->cur;
1471 return STRING;
1472 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001473
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 /* Line continuation */
1475 if (c == '\\') {
1476 c = tok_nextc(tok);
1477 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001478 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001479 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001480 return ERRORTOKEN;
1481 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001482 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001483 goto again; /* Read next line */
1484 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001485
Guido van Rossumfbab9051991-10-20 20:25:03 +00001486 /* Check for two-character token */
1487 {
1488 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001489 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001490 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001491 int c3 = tok_nextc(tok);
1492 int token3 = PyToken_ThreeChars(c, c2, c3);
1493 if (token3 != OP) {
1494 token = token3;
1495 } else {
1496 tok_backup(tok, c3);
1497 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001498 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001499 *p_end = tok->cur;
1500 return token;
1501 }
1502 tok_backup(tok, c2);
1503 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001504
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001505 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001506 switch (c) {
1507 case '(':
1508 case '[':
1509 case '{':
1510 tok->level++;
1511 break;
1512 case ')':
1513 case ']':
1514 case '}':
1515 tok->level--;
1516 break;
1517 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001518
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001519 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001520 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001522 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523}
1524
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001525int
1526PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1527{
1528 int result = tok_get(tok, p_start, p_end);
1529 if (tok->decoding_erred) {
1530 result = ERRORTOKEN;
1531 tok->done = E_DECODE;
1532 }
1533 return result;
1534}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001535
Martin v. Löwisfc787d52007-09-04 14:20:25 +00001536/* This function is only called from parsetok. However, it cannot live
1537 there, as it must be empty for PGEN, and we can check for PGEN only
1538 in this file. */
1539
Christian Heimesd2f4cb82008-01-23 14:20:41 +00001540#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant compiled when PGEN is defined or Py_USING_UNICODE is not:
   there is nothing to convert, so the arguments are ignored and NULL
   is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1546#else
1547static PyObject *
1548dec_utf8(const char *enc, const char *text, size_t len) {
1549 PyObject *ret = NULL;
1550 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1551 if (unicode_text) {
1552 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1553 Py_DECREF(unicode_text);
1554 }
1555 if (!ret) {
Guido van Rossume15fab42007-11-15 20:39:53 +00001556 PyErr_Clear();
Martin v. Löwisfc787d52007-09-04 14:20:25 +00001557 }
1558 return ret;
1559}
Martin v. Löwisfc787d52007-09-04 14:20:25 +00001560char *
1561PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1562{
1563 char *text = NULL;
1564 if (tok->encoding) {
1565 /* convert source to original encondig */
1566 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1567 if (lineobj != NULL) {
1568 int linelen = PyString_Size(lineobj);
1569 const char *line = PyString_AsString(lineobj);
1570 text = PyObject_MALLOC(linelen + 1);
1571 if (text != NULL && line != NULL) {
1572 if (linelen)
1573 strncpy(text, line, linelen);
1574 text[linelen] = '\0';
1575 }
1576 Py_DECREF(lineobj);
1577
1578 /* adjust error offset */
1579 if (*offset > 1) {
1580 PyObject *offsetobj = dec_utf8(tok->encoding,
1581 tok->buf, *offset-1);
1582 if (offsetobj) {
1583 *offset = PyString_Size(offsetobj) + 1;
1584 Py_DECREF(offsetobj);
1585 }
1586 }
1587
1588 }
1589 }
1590 return text;
1591
1592}
1593#endif
1594
1595
1596
Guido van Rossum408027e1996-12-30 16:17:54 +00001597#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001598
1599void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001600tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001601{
Guido van Rossum86bea461997-04-29 21:03:06 +00001602 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001603 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1604 printf("(%.*s)", (int)(end - start), start);
1605}
1606
1607#endif