blob: c58b6899b37e6b4853c03bcd560d59f0a8252b63 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
/* Py_CHARMASK(c) yields C unchanged when plain char is unsigned, and
   otherwise keeps only its low eight bits so the result is nonnegative. */
#if !defined(__CHAR_UNSIGNED__)
#define Py_CHARMASK(c) ((c) & 0xff)
#else
#define Py_CHARMASK(c) (c)
#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042/* Token names */
43
/* Printable name of every token type, indexed by token number.
   The leading comment on each row gives the index of its first entry. */
char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",
    /* 10 */ "RSQB", "COLON", "COMMA", "SEMI", "PLUS",
    /* 15 */ "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25 */ "BACKQUOTE", "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL",
    /* 30 */ "LESSEQUAL", "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT",
    /* 35 */ "RIGHTSHIFT", "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL", "STAREQUAL",
    /* 40 */ "SLASHEQUAL", "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
             "CIRCUMFLEXEQUAL",
    /* 45 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
             "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 50 */ "AT",
    /* This table must match the #defines in token.h! */
    /* 51 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Anthony Baxter11490022006-04-11 05:39:14 +0000108 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
109 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000110 if (tok == NULL)
111 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000112 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000113 tok->done = E_OK;
114 tok->fp = NULL;
115 tok->tabsize = TABSIZE;
116 tok->indent = 0;
117 tok->indstack[0] = 0;
118 tok->atbol = 1;
119 tok->pendin = 0;
120 tok->prompt = tok->nextprompt = NULL;
121 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000122 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000123 tok->filename = NULL;
124 tok->altwarning = 0;
125 tok->alterror = 0;
126 tok->alttabsize = 1;
127 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000128 tok->decoding_state = 0;
129 tok->decoding_erred = 0;
130 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* PGEN build: no decoding is performed; STR is handed back untouched
   and TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000167 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000173new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174{
Neal Norwitz08062d62006-04-11 08:19:15 +0000175 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Map the common spellings of utf-8 and latin-1 to one canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced by '-' before the comparison.  Returns the string
   literal "utf-8" or "iso-8859-1" when S names one of those encodings
   (optionally followed by a '-' suffix), and S itself otherwise.  S is
   never modified. */
static char *
get_normal_name(char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: handing tolower() a negative value
           (a high-bit byte on signed-char platforms) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000211 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
230
231 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000232 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000233 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000234 t++;
235
236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000240 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000241 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
248}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000281 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000282#else
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
286 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000287 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289 }
290 } else { /* then, compare cs with BOM */
291 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000295 if (!r) {
296 cs = tok->encoding;
297 if (!cs)
298 cs = "with BOM";
299 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000301 return r;
302}
303
304/* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
307
308static int
309check_bom(int get_char(struct tok_state *),
310 void unget_char(int, struct tok_state *),
311 int set_readline(struct tok_state *, const char *),
312 struct tok_state *tok)
313{
314 int ch = get_char(tok);
315 tok->decoding_state = 1;
316 if (ch == EOF) {
317 return 1;
318 } else if (ch == 0xEF) {
319 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
320 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
321#if 0
322 /* Disable support for UTF-16 BOMs until a decision
323 is made whether this needs to be supported. */
324 } else if (ch == 0xFE) {
325 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
326 if (!set_readline(tok, "utf-16-be")) return 0;
327 tok->decoding_state = -1;
328 } else if (ch == 0xFF) {
329 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-le")) return 0;
331 tok->decoding_state = -1;
332#endif
333 } else {
334 unget_char(ch, tok);
335 return 1;
336 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000337 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000338 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
340 return 1;
341 NON_BOM:
342 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
344 return 1;
345}
346
347/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000348 Return NULL on failure, else S.
Tim Petersc9d78aa2006-03-26 23:27:58 +0000349
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000350 On entry, tok->decoding_buffer will be one of:
351 1) NULL: need to call tok->decoding_readline to get a new line
352 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
353 stored the result in tok->decoding_buffer
354 3) PyStringObject *: previous call to fp_readl did not have enough room
355 (in the s buffer) to copy entire contents of the line read
356 by tok->decoding_readline. tok->decoding_buffer has the overflow.
357 In this case, fp_readl is called in a loop (with an expanded buffer)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000358 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 reached): see tok_nextc and its calls to decoding_fgets.
360*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361
/* Copy the next decoded line, as UTF-8, into S (capacity SIZE bytes
   including the terminating NUL).  A line longer than S is split: the
   remainder is stored in tok->decoding_buffer as a string object for
   the next call.  Returns S, or NULL at end of input or on error
   (errors go through error_ret). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the buffered object; an exact string object is
           used as is, without going through the UTF-8 encoder. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Keep what does not fit for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
411
412/* Set the readline function for TOK to a StreamReader's
413 readline function. The StreamReader is named ENC.
414
415 This function is called from check_bom and check_coding_spec.
416
417 ENC is usually identical to the future value of tok->encoding,
418 except for the (currently unsupported) case of UTF-16.
419
420 Return 1 on success, 0 on failure. */
421
422static int
423fp_setreadl(struct tok_state *tok, const char* enc)
424{
425 PyObject *reader, *stream, *readline;
426
Martin v. Löwis95292d62002-12-11 14:04:59 +0000427 /* XXX: constify filename argument. */
428 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 if (stream == NULL)
430 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000431
432 reader = PyCodec_StreamReader(enc, stream, NULL);
433 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000434 if (reader == NULL)
435 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
437 readline = PyObject_GetAttrString(reader, "readline");
438 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000439 if (readline == NULL)
440 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441
442 tok->decoding_readline = readline;
443 return 1;
444}
445
446/* Fetch the next byte from TOK. */
447
448static int fp_getc(struct tok_state *tok) {
449 return getc(tok->fp);
450}
451
452/* Unfetch the last byte back into TOK. */
453
454static void fp_ungetc(int c, struct tok_state *tok) {
455 ungetc(c, tok->fp);
456}
457
458/* Read a line of input from TOK. Determine encoding
459 if necessary. */
460
461static char *
462decoding_fgets(char *s, int size, struct tok_state *tok)
463{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000464 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000465 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000466 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467 if (tok->decoding_state < 0) {
468 /* We already have a codec associated with
469 this input. */
470 line = fp_readl(s, size, tok);
471 break;
472 } else if (tok->decoding_state > 0) {
473 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000474 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 break;
477 } else {
478 /* We have not yet determined the encoding.
479 If an encoding is found, use the file-pointer
480 reader functions from now on. */
481 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
482 return error_ret(tok);
483 assert(tok->decoding_state != 0);
484 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000485 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
487 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
488 return error_ret(tok);
489 }
490 }
491#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000492 /* The default encoding is ASCII, so make sure we don't have any
493 non-ASCII bytes in it. */
494 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000496 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000497 if (*c > 127) {
498 badchar = *c;
499 break;
500 }
501 }
502 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000503 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000504 /* Need to add 1 to the line number, since this line
505 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000506 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000507 "Non-ASCII character '\\x%.2x' "
508 "in file %.200s on line %i, "
509 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000510 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000511 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000512 PyErr_SetString(PyExc_SyntaxError, buf);
513 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514 }
515#endif
516 return line;
517}
518
519static int
520decoding_feof(struct tok_state *tok)
521{
522 if (tok->decoding_state >= 0) {
523 return feof(tok->fp);
524 } else {
525 PyObject* buf = tok->decoding_buffer;
526 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000527 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 if (buf == NULL) {
529 error_ret(tok);
530 return 1;
531 } else {
532 tok->decoding_buffer = buf;
533 }
534 }
535 return PyObject_Length(buf) == 0;
536 }
537}
538
539/* Fetch a byte from TOK, using the string buffer. */
540
Tim Petersc9d78aa2006-03-26 23:27:58 +0000541static int
542buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000543 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544}
545
546/* Unfetch a byte from TOK, using the string buffer. */
547
Tim Petersc9d78aa2006-03-26 23:27:58 +0000548static void
549buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000551 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
554/* Set the readline function for TOK to ENC. For the string-based
555 tokenizer, this means to just record the encoding. */
556
Tim Petersc9d78aa2006-03-26 23:27:58 +0000557static int
558buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 tok->enc = enc;
560 return 1;
561}
562
563/* Return a UTF-8 encoding Python string object from the
564 C byte string STR, which is encoded with ENC. */
565
#ifdef Py_USING_UNICODE
/* Decode the NUL-terminated bytes STR from encoding ENC and return them
   re-encoded as a UTF-8 string object (new reference).
   Returns NULL if decoding or re-encoding fails. */
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *unicode = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (unicode != NULL) {
        encoded = PyUnicode_AsUTF8String(unicode);
        Py_DECREF(unicode);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578
579/* Decode a byte string STR for use as the buffer of TOK.
580 Look for encoding declarations inside STR, and record them
581 inside TOK. */
582
583static const char *
584decode_str(const char *str, struct tok_state *tok)
585{
586 PyObject* utf8 = NULL;
587 const char *s;
588 int lineno = 0;
589 tok->enc = NULL;
590 tok->str = str;
591 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000592 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000594 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000595#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 if (tok->enc != NULL) {
597 utf8 = translate_into_utf8(str, tok->enc);
598 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000599 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600 str = PyString_AsString(utf8);
601 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000602#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 for (s = str;; s++) {
604 if (*s == '\0') break;
605 else if (*s == '\n') {
606 lineno++;
607 if (lineno == 2) break;
608 }
609 }
610 tok->enc = NULL;
611 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000612 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000613#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614 if (tok->enc != NULL) {
615 assert(utf8 == NULL);
616 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000617 if (utf8 == NULL) {
618 PyErr_Format(PyExc_SyntaxError,
619 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000620 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000621 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 str = PyString_AsString(utf8);
623 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000624#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 assert(tok->decoding_buffer == NULL);
626 tok->decoding_buffer = utf8; /* CAUTION */
627 return str;
628}
629
630#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631
632/* Set up tokenizer for string */
633
634struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000635PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636{
637 struct tok_state *tok = tok_new();
638 if (tok == NULL)
639 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000641 if (str == NULL) {
642 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000644 }
645
Martin v. Löwis95292d62002-12-11 14:04:59 +0000646 /* XXX: constify members. */
647 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 return tok;
649}
650
651
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000652/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653
654struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000655PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656{
657 struct tok_state *tok = tok_new();
658 if (tok == NULL)
659 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000660 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000661 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000662 return NULL;
663 }
664 tok->cur = tok->inp = tok->buf;
665 tok->end = tok->buf + BUFSIZ;
666 tok->fp = fp;
667 tok->prompt = ps1;
668 tok->nextprompt = ps2;
669 return tok;
670}
671
672
673/* Free a tok_state structure */
674
675void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000676PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000678 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000679 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000680#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000681 Py_XDECREF(tok->decoding_readline);
682 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000683#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000685 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000686 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687}
688
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the interactively read line *INP from the encoding of
   sys.stdin into UTF-8.

   On success *INP is freed and replaced by the converted string, and
   tok->encoding is set to a copy of the stdin encoding name.  If
   sys.stdin is not the C stdin, has no string f_encoding, or the text
   cannot be converted, *INP is left alone and any pending exception is
   cleared.
   Returns 0 in all of those cases, or -1 with tok->done = E_NOMEM when
   a string copy cannot be allocated. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc;
    PyObject *stdin_obj;
    PyObject *unicode;
    PyObject *encoded;
    const char *encname;
    char *replacement;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);     /* hold the name for the rest of the function */

    encname = PyString_AsString(enc);
    unicode = PyUnicode_Decode(*inp, strlen(*inp), encname, NULL);
    if (unicode == NULL)
        goto fallback;

    encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (encoded == NULL)
        goto fallback;

    assert(PyString_Check(encoded));
    replacement = new_string(PyString_AS_STRING(encoded),
                             PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (replacement == NULL)
        goto no_memory;

    /* Swap the caller's line for the converted one. */
    PyMem_FREE(*inp);
    *inp = replacement;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encname, strlen(encname));
    if (tok->encoding == NULL)
        goto no_memory;

    Py_DECREF(enc);
    return 0;

no_memory:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

fallback:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748
749/* Get next char, updating state; error code goes into tok->done */
750
751static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000752tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000755 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000756 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000757 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000758 if (tok->done != E_OK)
759 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000761 char *end = strchr(tok->inp, '\n');
762 if (end != NULL)
763 end++;
764 else {
765 end = strchr(tok->inp, '\0');
766 if (end == tok->inp) {
767 tok->done = E_EOF;
768 return EOF;
769 }
770 }
771 if (tok->start == NULL)
772 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000773 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000774 tok->lineno++;
775 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000776 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000779 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780 if (tok->nextprompt != NULL)
781 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000782 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000784 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000785 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000786 tok->done = E_EOF;
787 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000788#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000789 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000790 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000791#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000792 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000793 size_t start = tok->start - tok->buf;
794 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000795 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000796 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000797 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000798 tok->lineno++;
799 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000800 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000801 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000802 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000803 tok->done = E_NOMEM;
804 return EOF;
805 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000806 tok->buf = buf;
807 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000808 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000809 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000810 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 tok->inp = tok->buf + newlen;
812 tok->end = tok->inp + 1;
813 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000814 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000815 else {
816 tok->lineno++;
817 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000818 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000819 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000820 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000822 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 tok->inp = strchr(tok->buf, '\0');
824 tok->end = tok->inp + 1;
825 }
826 }
827 else {
828 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000829 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000830 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 if (tok->start == NULL) {
832 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000833 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000834 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000835 if (tok->buf == NULL) {
836 tok->done = E_NOMEM;
837 return EOF;
838 }
839 tok->end = tok->buf + BUFSIZ;
840 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000841 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
842 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000843 tok->done = E_EOF;
844 done = 1;
845 }
846 else {
847 tok->done = E_OK;
848 tok->inp = strchr(tok->buf, '\0');
849 done = tok->inp[-1] == '\n';
850 }
851 }
852 else {
853 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000854 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000855 tok->done = E_EOF;
856 done = 1;
857 }
858 else
859 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 }
861 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000862 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000863 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000864 Py_ssize_t curstart = tok->start == NULL ? -1 :
865 tok->start - tok->buf;
866 Py_ssize_t curvalid = tok->inp - tok->buf;
867 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000868 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000869 newbuf = (char *)PyMem_REALLOC(newbuf,
870 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000871 if (newbuf == NULL) {
872 tok->done = E_NOMEM;
873 tok->cur = tok->inp;
874 return EOF;
875 }
876 tok->buf = newbuf;
877 tok->inp = tok->buf + curvalid;
878 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 tok->start = curstart < 0 ? NULL :
880 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000881 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000883 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000884 /* Break out early on decoding
885 errors, as tok->buf will be NULL
886 */
887 if (tok->decoding_erred)
888 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 /* Last line does not end in \n,
890 fake one */
891 strcpy(tok->inp, "\n");
892 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000894 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000896 if (tok->buf != NULL) {
897 tok->cur = tok->buf + cur;
898 tok->line_start = tok->cur;
899 /* replace "\r\n" with "\n" */
900 /* For Mac leave the \r, giving syntax error */
901 pt = tok->inp - 2;
902 if (pt >= tok->buf && *pt == '\r') {
903 *pt++ = '\n';
904 *pt = '\0';
905 tok->inp = pt;
906 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000907 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908 }
909 if (tok->done != E_OK) {
910 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000911 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000912 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913 return EOF;
914 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000915 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000916 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000917}
918
919
920/* Back-up one character */
921
922static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000923tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000924{
925 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000926 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000927 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000928 if (*tok->cur != c)
929 *tok->cur = c;
930 }
931}
932
933
934/* Return the token corresponding to a single character */
935
936int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000937PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000938{
939 switch (c) {
940 case '(': return LPAR;
941 case ')': return RPAR;
942 case '[': return LSQB;
943 case ']': return RSQB;
944 case ':': return COLON;
945 case ',': return COMMA;
946 case ';': return SEMI;
947 case '+': return PLUS;
948 case '-': return MINUS;
949 case '*': return STAR;
950 case '/': return SLASH;
951 case '|': return VBAR;
952 case '&': return AMPER;
953 case '<': return LESS;
954 case '>': return GREATER;
955 case '=': return EQUAL;
956 case '.': return DOT;
957 case '%': return PERCENT;
958 case '`': return BACKQUOTE;
959 case '{': return LBRACE;
960 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000961 case '^': return CIRCUMFLEX;
962 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000963 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000964 default: return OP;
965 }
966}
967
968
Guido van Rossumfbab9051991-10-20 20:25:03 +0000969int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000970PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000971{
972 switch (c1) {
973 case '=':
974 switch (c2) {
975 case '=': return EQEQUAL;
976 }
977 break;
978 case '!':
979 switch (c2) {
980 case '=': return NOTEQUAL;
981 }
982 break;
983 case '<':
984 switch (c2) {
985 case '>': return NOTEQUAL;
986 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000987 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000988 }
989 break;
990 case '>':
991 switch (c2) {
992 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000993 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000994 }
995 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000996 case '+':
997 switch (c2) {
998 case '=': return PLUSEQUAL;
999 }
1000 break;
1001 case '-':
1002 switch (c2) {
1003 case '=': return MINEQUAL;
1004 }
1005 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001006 case '*':
1007 switch (c2) {
1008 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 case '=': return STAREQUAL;
1010 }
1011 break;
1012 case '/':
1013 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001014 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001015 case '=': return SLASHEQUAL;
1016 }
1017 break;
1018 case '|':
1019 switch (c2) {
1020 case '=': return VBAREQUAL;
1021 }
1022 break;
1023 case '%':
1024 switch (c2) {
1025 case '=': return PERCENTEQUAL;
1026 }
1027 break;
1028 case '&':
1029 switch (c2) {
1030 case '=': return AMPEREQUAL;
1031 }
1032 break;
1033 case '^':
1034 switch (c2) {
1035 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001036 }
1037 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001038 }
1039 return OP;
1040}
1041
Thomas Wouters434d0822000-08-24 20:11:32 +00001042int
1043PyToken_ThreeChars(int c1, int c2, int c3)
1044{
1045 switch (c1) {
1046 case '<':
1047 switch (c2) {
1048 case '<':
1049 switch (c3) {
1050 case '=':
1051 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001052 }
1053 break;
1054 }
1055 break;
1056 case '>':
1057 switch (c2) {
1058 case '>':
1059 switch (c3) {
1060 case '=':
1061 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001062 }
1063 break;
1064 }
1065 break;
1066 case '*':
1067 switch (c2) {
1068 case '*':
1069 switch (c3) {
1070 case '=':
1071 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001072 }
1073 break;
1074 }
1075 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001076 case '/':
1077 switch (c2) {
1078 case '/':
1079 switch (c3) {
1080 case '=':
1081 return DOUBLESLASHEQUAL;
1082 }
1083 break;
1084 }
1085 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001086 }
1087 return OP;
1088}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001089
Guido van Rossum926f13a1998-04-09 21:38:06 +00001090static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001091indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001092{
1093 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001094 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095 tok->cur = tok->inp;
1096 return 1;
1097 }
1098 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001099 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1100 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001101 tok->altwarning = 0;
1102 }
1103 return 0;
1104}
1105
1106
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001107/* Get next token, after space stripping etc. */
1108
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001109static int
1110tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001111{
1112 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001113 int blankline;
1114
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001115 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001116 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001117 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001118 blankline = 0;
1119
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 /* Get indentation level */
1121 if (tok->atbol) {
1122 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001124 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001125 for (;;) {
1126 c = tok_nextc(tok);
1127 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001128 col++, altcol++;
1129 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001130 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001131 altcol = (altcol/tok->alttabsize + 1)
1132 * tok->alttabsize;
1133 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001134 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001135 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136 else
1137 break;
1138 }
1139 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001140 if (c == '#' || c == '\n') {
1141 /* Lines with only whitespace and/or comments
1142 shouldn't affect the indentation and are
1143 not passed to the parser as NEWLINE tokens,
1144 except *totally* empty lines in interactive
1145 mode, which signal the end of a command group. */
1146 if (col == 0 && c == '\n' && tok->prompt != NULL)
1147 blankline = 0; /* Let it through */
1148 else
1149 blankline = 1; /* Ignore completely */
1150 /* We can't jump back right here since we still
1151 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001152 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001153 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001154 if (col == tok->indstack[tok->indent]) {
1155 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001156 if (altcol != tok->altindstack[tok->indent]) {
1157 if (indenterror(tok))
1158 return ERRORTOKEN;
1159 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001160 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001161 else if (col > tok->indstack[tok->indent]) {
1162 /* Indent -- always one */
1163 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001164 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001165 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001166 return ERRORTOKEN;
1167 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001168 if (altcol <= tok->altindstack[tok->indent]) {
1169 if (indenterror(tok))
1170 return ERRORTOKEN;
1171 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001172 tok->pendin++;
1173 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001174 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001175 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001176 else /* col < tok->indstack[tok->indent] */ {
1177 /* Dedent -- any number, must be consistent */
1178 while (tok->indent > 0 &&
1179 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001180 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001181 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001182 }
1183 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001184 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001185 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001186 return ERRORTOKEN;
1187 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001188 if (altcol != tok->altindstack[tok->indent]) {
1189 if (indenterror(tok))
1190 return ERRORTOKEN;
1191 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192 }
1193 }
1194 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001195
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001196 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001197
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 /* Return pending indents/dedents */
1199 if (tok->pendin != 0) {
1200 if (tok->pendin < 0) {
1201 tok->pendin++;
1202 return DEDENT;
1203 }
1204 else {
1205 tok->pendin--;
1206 return INDENT;
1207 }
1208 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001209
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001210 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001211 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001212 /* Skip spaces */
1213 do {
1214 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001215 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001216
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001218 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001219
Guido van Rossumab5ca152000-03-31 00:52:27 +00001220 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001222 static char *tabforms[] = {
1223 "tab-width:", /* Emacs */
1224 ":tabstop=", /* vim, full form */
1225 ":ts=", /* vim, abbreviated form */
1226 "set tabsize=", /* will vi never die? */
1227 /* more templates can be added here to support other editors */
1228 };
1229 char cbuf[80];
1230 char *tp, **cp;
1231 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001233 *tp++ = c = tok_nextc(tok);
1234 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001235 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001236 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001237 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001238 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1239 cp++) {
1240 if ((tp = strstr(cbuf, *cp))) {
1241 int newsize = atoi(tp + strlen(*cp));
1242
1243 if (newsize >= 1 && newsize <= 40) {
1244 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001245 if (Py_VerboseFlag)
1246 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001247 "Tab size set to %d\n",
1248 newsize);
1249 }
1250 }
1251 }
1252 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001255
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001257 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001259 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001260
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 /* Identifier (most frequent token!) */
1262 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001263 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001264 switch (c) {
1265 case 'r':
1266 case 'R':
1267 c = tok_nextc(tok);
1268 if (c == '"' || c == '\'')
1269 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001270 break;
1271 case 'u':
1272 case 'U':
1273 c = tok_nextc(tok);
1274 if (c == 'r' || c == 'R')
1275 c = tok_nextc(tok);
1276 if (c == '"' || c == '\'')
1277 goto letter_quote;
1278 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001279 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001280 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001282 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001284 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 *p_end = tok->cur;
1286 return NAME;
1287 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001288
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 /* Newline */
1290 if (c == '\n') {
1291 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001292 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001293 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001294 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001295 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001296 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001297 return NEWLINE;
1298 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001299
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001300 /* Period or number starting with period? */
1301 if (c == '.') {
1302 c = tok_nextc(tok);
1303 if (isdigit(c)) {
1304 goto fraction;
1305 }
1306 else {
1307 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001308 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001309 *p_end = tok->cur;
1310 return DOT;
1311 }
1312 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001313
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 /* Number */
1315 if (isdigit(c)) {
1316 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001317 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 c = tok_nextc(tok);
1319 if (c == '.')
1320 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001321#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001322 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001323 goto imaginary;
1324#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001325 if (c == 'x' || c == 'X') {
1326 /* Hex */
1327 do {
1328 c = tok_nextc(tok);
1329 } while (isxdigit(c));
1330 }
1331 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001332 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 /* Octal; c is first char of it */
1334 /* There's no 'isoctdigit' macro, sigh */
1335 while ('0' <= c && c < '8') {
1336 c = tok_nextc(tok);
1337 }
Tim Petersd507dab2001-08-30 20:51:59 +00001338 if (isdigit(c)) {
1339 found_decimal = 1;
1340 do {
1341 c = tok_nextc(tok);
1342 } while (isdigit(c));
1343 }
1344 if (c == '.')
1345 goto fraction;
1346 else if (c == 'e' || c == 'E')
1347 goto exponent;
1348#ifndef WITHOUT_COMPLEX
1349 else if (c == 'j' || c == 'J')
1350 goto imaginary;
1351#endif
1352 else if (found_decimal) {
1353 tok->done = E_TOKEN;
1354 tok_backup(tok, c);
1355 return ERRORTOKEN;
1356 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001358 if (c == 'l' || c == 'L')
1359 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 }
1361 else {
1362 /* Decimal */
1363 do {
1364 c = tok_nextc(tok);
1365 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001366 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001368 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001369 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001370 if (c == '.') {
1371 fraction:
1372 /* Fraction */
1373 do {
1374 c = tok_nextc(tok);
1375 } while (isdigit(c));
1376 }
1377 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001378 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001379 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001381 if (c == '+' || c == '-')
1382 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001383 if (!isdigit(c)) {
1384 tok->done = E_TOKEN;
1385 tok_backup(tok, c);
1386 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001387 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001388 do {
1389 c = tok_nextc(tok);
1390 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001392#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001393 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001394 /* Imaginary part */
1395 imaginary:
1396 c = tok_nextc(tok);
1397#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001398 }
1399 }
1400 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001401 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 *p_end = tok->cur;
1403 return NUMBER;
1404 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001405
1406 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 /* String */
1408 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001410 int quote = c;
1411 int triple = 0;
1412 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 for (;;) {
1414 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 if (c == '\n') {
1416 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001417 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001418 tok_backup(tok, c);
1419 return ERRORTOKEN;
1420 }
1421 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001422 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001423 }
1424 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001425 if (triple)
1426 tok->done = E_EOFS;
1427 else
1428 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001429 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 return ERRORTOKEN;
1431 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001432 else if (c == quote) {
1433 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001434 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001435 c = tok_nextc(tok);
1436 if (c == quote) {
1437 triple = 1;
1438 tripcount = 0;
1439 continue;
1440 }
1441 tok_backup(tok, c);
1442 }
1443 if (!triple || tripcount == 3)
1444 break;
1445 }
1446 else if (c == '\\') {
1447 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001448 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001449 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001450 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001451 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001452 return ERRORTOKEN;
1453 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001454 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001455 else
1456 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001457 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001458 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001459 *p_end = tok->cur;
1460 return STRING;
1461 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001462
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 /* Line continuation */
1464 if (c == '\\') {
1465 c = tok_nextc(tok);
1466 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001467 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001468 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001469 return ERRORTOKEN;
1470 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001471 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001472 goto again; /* Read next line */
1473 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001474
Guido van Rossumfbab9051991-10-20 20:25:03 +00001475 /* Check for two-character token */
1476 {
1477 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001478 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001479 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001480 int c3 = tok_nextc(tok);
1481 int token3 = PyToken_ThreeChars(c, c2, c3);
1482 if (token3 != OP) {
1483 token = token3;
1484 } else {
1485 tok_backup(tok, c3);
1486 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001487 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001488 *p_end = tok->cur;
1489 return token;
1490 }
1491 tok_backup(tok, c2);
1492 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001493
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001494 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001495 switch (c) {
1496 case '(':
1497 case '[':
1498 case '{':
1499 tok->level++;
1500 break;
1501 case ')':
1502 case ']':
1503 case '}':
1504 tok->level--;
1505 break;
1506 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001507
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001508 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001509 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001510 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001511 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001512}
1513
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001514int
1515PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1516{
1517 int result = tok_get(tok, p_start, p_end);
1518 if (tok->decoding_erred) {
1519 result = ERRORTOKEN;
1520 tok->done = E_DECODE;
1521 }
1522 return result;
1523}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001524
Guido van Rossum408027e1996-12-30 16:17:54 +00001525#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001526
1527void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001528tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001529{
Guido van Rossum86bea461997-04-29 21:03:06 +00001530 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001531 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1532 printf("(%.*s)", (int)(end - start), start);
1533}
1534
1535#endif