blob: dee8e846a19484c4284f79a1d56c59f5a6244f74 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Line reader defined outside this file.
   Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

25
/* Tab size used by the tokenizer.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int.
   XXX This assumes characters are 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c) (c)
#else
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif
36
/* Forward declarations */
static struct tok_state *tok_new(void);               /* allocate a tok_state */
static int tok_nextc(struct tok_state *tok);          /* get next char */
static void tok_backup(struct tok_state *tok, int c);

Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Guido van Rossum86bea461997-04-29 21:03:06 +0000108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
130 tok->issued_encoding_warning = 0;
131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* PGEN build: strings are used as-is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167 PyMem_DEL(tok->buf);
168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000173new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174{
175 char* result = PyMem_NEW(char, len + 1);
176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined; they are
   lower-cased and '_' is treated as '-'.  Returns the string literal
   "utf-8" or "iso-8859-1" when S names (or starts with, followed by
   '-') one of the known spellings; otherwise returns S itself,
   unmodified.  Callers can therefore compare the result with S to
   find out whether a replacement happened. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000211 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
230
231 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000232 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000233 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000234 t++;
235
236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000240 PyMem_DEL(r);
241 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
248}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
281 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000282#else
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
286 won't matter. */
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000287 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289 }
290 } else { /* then, compare cs with BOM */
291 r = (strcmp(tok->encoding, cs) == 0);
292 PyMem_DEL(cs);
293 }
294 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000295 if (!r) {
296 cs = tok->encoding;
297 if (!cs)
298 cs = "with BOM";
299 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000301 return r;
302}
303
304/* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
307
308static int
309check_bom(int get_char(struct tok_state *),
310 void unget_char(int, struct tok_state *),
311 int set_readline(struct tok_state *, const char *),
312 struct tok_state *tok)
313{
314 int ch = get_char(tok);
315 tok->decoding_state = 1;
316 if (ch == EOF) {
317 return 1;
318 } else if (ch == 0xEF) {
319 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
320 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
321#if 0
322 /* Disable support for UTF-16 BOMs until a decision
323 is made whether this needs to be supported. */
324 } else if (ch == 0xFE) {
325 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
326 if (!set_readline(tok, "utf-16-be")) return 0;
327 tok->decoding_state = -1;
328 } else if (ch == 0xFF) {
329 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-le")) return 0;
331 tok->decoding_state = -1;
332#endif
333 } else {
334 unget_char(ch, tok);
335 return 1;
336 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000337 if (tok->encoding != NULL)
338 PyMem_DEL(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
340 return 1;
341 NON_BOM:
342 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
344 return 1;
345}
346
/* Read one decoded line for TOK into S (at most SIZE-1 bytes plus a
   terminating NUL), as UTF-8.  Return S, or NULL on failure or EOF.

   On entry, tok->decoding_buffer is one of:
     1) NULL: tok->decoding_readline must be called to get a new line.
     2) a PyUnicodeObject: decoding_feof already called
        tok->decoding_readline and parked the result there.
     3) a PyStringObject: the previous fp_readl call did not have room
        in S for the whole line, and the overflow was parked there.
        fp_readl is then called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' or the end of the file is
        reached: see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        tok->decoding_buffer = NULL;
        /* An exact string is leftover UTF-8 and needs no encoding. */
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Too long for S: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
411
412/* Set the readline function for TOK to a StreamReader's
413 readline function. The StreamReader is named ENC.
414
415 This function is called from check_bom and check_coding_spec.
416
417 ENC is usually identical to the future value of tok->encoding,
418 except for the (currently unsupported) case of UTF-16.
419
420 Return 1 on success, 0 on failure. */
421
422static int
423fp_setreadl(struct tok_state *tok, const char* enc)
424{
425 PyObject *reader, *stream, *readline;
426
Martin v. Löwis95292d62002-12-11 14:04:59 +0000427 /* XXX: constify filename argument. */
428 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000429 if (stream == NULL)
430 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000431
432 reader = PyCodec_StreamReader(enc, stream, NULL);
433 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000434 if (reader == NULL)
435 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
437 readline = PyObject_GetAttrString(reader, "readline");
438 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000439 if (readline == NULL)
440 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441
442 tok->decoding_readline = readline;
443 return 1;
444}
445
446/* Fetch the next byte from TOK. */
447
448static int fp_getc(struct tok_state *tok) {
449 return getc(tok->fp);
450}
451
452/* Unfetch the last byte back into TOK. */
453
454static void fp_ungetc(int c, struct tok_state *tok) {
455 ungetc(c, tok->fp);
456}
457
458/* Read a line of input from TOK. Determine encoding
459 if necessary. */
460
461static char *
462decoding_fgets(char *s, int size, struct tok_state *tok)
463{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000464 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000466 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467 if (tok->decoding_state < 0) {
468 /* We already have a codec associated with
469 this input. */
470 line = fp_readl(s, size, tok);
471 break;
472 } else if (tok->decoding_state > 0) {
473 /* We want a 'raw' read. */
474 line = Py_UniversalNewlineFgets(s, size,
475 tok->fp, NULL);
476 warn = 1;
477 break;
478 } else {
479 /* We have not yet determined the encoding.
480 If an encoding is found, use the file-pointer
481 reader functions from now on. */
482 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
483 return error_ret(tok);
484 assert(tok->decoding_state != 0);
485 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000486 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
488 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
489 return error_ret(tok);
490 }
491 }
492#ifndef PGEN
493 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
494 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000495 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (*c > 127) {
497 badchar = *c;
498 break;
499 }
500 }
501 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000502 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000503 /* Need to add 1 to the line number, since this line
504 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000505 sprintf(buf,
506 "Non-ASCII character '\\x%.2x' "
507 "in file %.200s on line %i, "
508 "but no encoding declared; "
509 "see http://www.python.org/peps/pep-0263.html for details",
510 badchar, tok->filename, tok->lineno + 1);
511 /* We don't use PyErr_WarnExplicit() here because
512 printing the line in question to e.g. a log file
513 could result in sensitive information being
514 exposed. */
515 PyErr_Warn(PyExc_DeprecationWarning, buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516 tok->issued_encoding_warning = 1;
517 }
518#endif
519 return line;
520}
521
522static int
523decoding_feof(struct tok_state *tok)
524{
525 if (tok->decoding_state >= 0) {
526 return feof(tok->fp);
527 } else {
528 PyObject* buf = tok->decoding_buffer;
529 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000530 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531 if (buf == NULL) {
532 error_ret(tok);
533 return 1;
534 } else {
535 tok->decoding_buffer = buf;
536 }
537 }
538 return PyObject_Length(buf) == 0;
539 }
540}
541
542/* Fetch a byte from TOK, using the string buffer. */
543
544static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000545 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546}
547
548/* Unfetch a byte from TOK, using the string buffer. */
549
550static void buf_ungetc(int c, struct tok_state *tok) {
551 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000552 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Set the readline function for TOK to ENC. For the string-based
556 tokenizer, this means to just record the encoding. */
557
558static int buf_setreadl(struct tok_state *tok, const char* enc) {
559 tok->enc = enc;
560 return 1;
561}
562
/* Decode the NUL-terminated C byte string STR using encoding ENC and
   return the text re-encoded as a UTF-8 Python string object.
   Returns NULL if either step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded == NULL) {
        return NULL;
    }
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578
579/* Decode a byte string STR for use as the buffer of TOK.
580 Look for encoding declarations inside STR, and record them
581 inside TOK. */
582
583static const char *
584decode_str(const char *str, struct tok_state *tok)
585{
586 PyObject* utf8 = NULL;
587 const char *s;
588 int lineno = 0;
589 tok->enc = NULL;
590 tok->str = str;
591 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000592 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000594 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000595#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 if (tok->enc != NULL) {
597 utf8 = translate_into_utf8(str, tok->enc);
598 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000599 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600 str = PyString_AsString(utf8);
601 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000602#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 for (s = str;; s++) {
604 if (*s == '\0') break;
605 else if (*s == '\n') {
606 lineno++;
607 if (lineno == 2) break;
608 }
609 }
610 tok->enc = NULL;
611 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000612 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000613#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614 if (tok->enc != NULL) {
615 assert(utf8 == NULL);
616 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000617 if (utf8 == NULL) {
618 PyErr_Format(PyExc_SyntaxError,
619 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000620 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000621 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 str = PyString_AsString(utf8);
623 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000624#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 assert(tok->decoding_buffer == NULL);
626 tok->decoding_buffer = utf8; /* CAUTION */
627 return str;
628}
629
630#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631
632/* Set up tokenizer for string */
633
634struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000635PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636{
637 struct tok_state *tok = tok_new();
638 if (tok == NULL)
639 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000641 if (str == NULL) {
642 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000644 }
645
Martin v. Löwis95292d62002-12-11 14:04:59 +0000646 /* XXX: constify members. */
647 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648 return tok;
649}
650
651
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000652/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653
654struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000655PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656{
657 struct tok_state *tok = tok_new();
658 if (tok == NULL)
659 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000660 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000661 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000662 return NULL;
663 }
664 tok->cur = tok->inp = tok->buf;
665 tok->end = tok->buf + BUFSIZ;
666 tok->fp = fp;
667 tok->prompt = ps1;
668 tok->nextprompt = ps2;
669 return tok;
670}
671
672
673/* Free a tok_state structure */
674
675void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000676PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000678 if (tok->encoding != NULL)
679 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000680#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000681 Py_XDECREF(tok->decoding_readline);
682 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000683#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000685 PyMem_DEL(tok->buf);
686 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687}
688
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Convert the line *INP, read from stdin, to UTF-8 using the encoding
   of sys.stdin.

   Nothing is done (and 0 is returned) when sys.stdin is not the C
   stdin, is not a file object, or has no string encoding, and also
   when decoding or re-encoding fails (the error is cleared).
   On success *INP is freed and replaced by the converted line,
   tok->encoding is set to a copy of the encoding name, and 0 is
   returned.  If memory runs out, tok->done becomes E_NOMEM and -1 is
   returned. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *text, *encoded;
    const char *enc_name;
    char *replacement;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin) {
        return 0;
    }
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj)) {
        return 0;
    }

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj)) {
        return 0;
    }
    /* Hold our own reference while the name is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    text = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (text == NULL) {
        goto fallback;
    }

    encoded = PyUnicode_AsEncodedString(text, "utf-8", NULL);
    Py_DECREF(text);
    if (encoded == NULL) {
        goto fallback;
    }

    replacement = new_string(PyString_AsString(encoded),
                             PyString_Size(encoded));
    Py_DECREF(encoded);
    if (replacement == NULL) {
        goto nomem;
    }

    PyMem_FREE(*inp);
    *inp = replacement;

    if (tok->encoding != NULL) {
        PyMem_DEL(tok->encoding);
    }
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL) {
        goto nomem;
    }

    Py_DECREF(enc_obj);
    return 0;

nomem:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

fallback:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746
747/* Get next char, updating state; error code goes into tok->done */
748
749static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000750tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000753 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000754 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000755 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000756 if (tok->done != E_OK)
757 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000759 char *end = strchr(tok->inp, '\n');
760 if (end != NULL)
761 end++;
762 else {
763 end = strchr(tok->inp, '\0');
764 if (end == tok->inp) {
765 tok->done = E_EOF;
766 return EOF;
767 }
768 }
769 if (tok->start == NULL)
770 tok->buf = tok->cur;
771 tok->lineno++;
772 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000773 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000776 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777 if (tok->nextprompt != NULL)
778 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 if (new == NULL)
780 tok->done = E_INTR;
781 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000782 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 tok->done = E_EOF;
784 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000785#if !defined(PGEN) && defined(Py_USING_UNICODE)
786 else if (tok_stdin_decode(tok, &new) != 0)
787 PyMem_FREE(new);
788#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000789 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000790 size_t start = tok->start - tok->buf;
791 size_t oldlen = tok->cur - tok->buf;
792 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000793 char *buf = tok->buf;
794 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000795 tok->lineno++;
796 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000797 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000798 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000799 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000800 tok->done = E_NOMEM;
801 return EOF;
802 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 tok->buf = buf;
804 tok->cur = tok->buf + oldlen;
805 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000806 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 tok->inp = tok->buf + newlen;
808 tok->end = tok->inp + 1;
809 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000810 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 else {
812 tok->lineno++;
813 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000814 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000815 tok->buf = new;
816 tok->cur = tok->buf;
817 tok->inp = strchr(tok->buf, '\0');
818 tok->end = tok->inp + 1;
819 }
820 }
821 else {
822 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000823 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000824 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000825 if (tok->start == NULL) {
826 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000827 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000828 if (tok->buf == NULL) {
829 tok->done = E_NOMEM;
830 return EOF;
831 }
832 tok->end = tok->buf + BUFSIZ;
833 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000834 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
835 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 tok->done = E_EOF;
837 done = 1;
838 }
839 else {
840 tok->done = E_OK;
841 tok->inp = strchr(tok->buf, '\0');
842 done = tok->inp[-1] == '\n';
843 }
844 }
845 else {
846 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000847 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000848 tok->done = E_EOF;
849 done = 1;
850 }
851 else
852 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000853 }
854 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000855 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000857 Py_ssize_t curstart = tok->start == NULL ? -1 :
858 tok->start - tok->buf;
859 Py_ssize_t curvalid = tok->inp - tok->buf;
860 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000861 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000862 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 if (newbuf == NULL) {
864 tok->done = E_NOMEM;
865 tok->cur = tok->inp;
866 return EOF;
867 }
868 tok->buf = newbuf;
869 tok->inp = tok->buf + curvalid;
870 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000871 tok->start = curstart < 0 ? NULL :
872 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000873 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000874 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000875 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000876 /* Last line does not end in \n,
877 fake one */
878 strcpy(tok->inp, "\n");
879 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000880 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000881 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000883 tok->cur = tok->buf + cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000884 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000885 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000886 pt = tok->inp - 2;
887 if (pt >= tok->buf && *pt == '\r') {
888 *pt++ = '\n';
889 *pt = '\0';
890 tok->inp = pt;
891 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000892 }
893 if (tok->done != E_OK) {
894 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000895 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000897 return EOF;
898 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000899 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000901}
902
903
904/* Back-up one character */
905
906static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000907tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908{
909 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000910 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000911 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 if (*tok->cur != c)
913 *tok->cur = c;
914 }
915}
916
917
918/* Return the token corresponding to a single character */
919
920int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000921PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000922{
923 switch (c) {
924 case '(': return LPAR;
925 case ')': return RPAR;
926 case '[': return LSQB;
927 case ']': return RSQB;
928 case ':': return COLON;
929 case ',': return COMMA;
930 case ';': return SEMI;
931 case '+': return PLUS;
932 case '-': return MINUS;
933 case '*': return STAR;
934 case '/': return SLASH;
935 case '|': return VBAR;
936 case '&': return AMPER;
937 case '<': return LESS;
938 case '>': return GREATER;
939 case '=': return EQUAL;
940 case '.': return DOT;
941 case '%': return PERCENT;
942 case '`': return BACKQUOTE;
943 case '{': return LBRACE;
944 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000945 case '^': return CIRCUMFLEX;
946 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000947 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000948 default: return OP;
949 }
950}
951
952
Guido van Rossumfbab9051991-10-20 20:25:03 +0000953int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000954PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000955{
956 switch (c1) {
957 case '=':
958 switch (c2) {
959 case '=': return EQEQUAL;
960 }
961 break;
962 case '!':
963 switch (c2) {
964 case '=': return NOTEQUAL;
965 }
966 break;
967 case '<':
968 switch (c2) {
969 case '>': return NOTEQUAL;
970 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000971 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000972 }
973 break;
974 case '>':
975 switch (c2) {
976 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000977 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000978 }
979 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000980 case '+':
981 switch (c2) {
982 case '=': return PLUSEQUAL;
983 }
984 break;
985 case '-':
986 switch (c2) {
987 case '=': return MINEQUAL;
988 }
989 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000990 case '*':
991 switch (c2) {
992 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000993 case '=': return STAREQUAL;
994 }
995 break;
996 case '/':
997 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000998 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000999 case '=': return SLASHEQUAL;
1000 }
1001 break;
1002 case '|':
1003 switch (c2) {
1004 case '=': return VBAREQUAL;
1005 }
1006 break;
1007 case '%':
1008 switch (c2) {
1009 case '=': return PERCENTEQUAL;
1010 }
1011 break;
1012 case '&':
1013 switch (c2) {
1014 case '=': return AMPEREQUAL;
1015 }
1016 break;
1017 case '^':
1018 switch (c2) {
1019 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001020 }
1021 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001022 }
1023 return OP;
1024}
1025
Thomas Wouters434d0822000-08-24 20:11:32 +00001026int
1027PyToken_ThreeChars(int c1, int c2, int c3)
1028{
1029 switch (c1) {
1030 case '<':
1031 switch (c2) {
1032 case '<':
1033 switch (c3) {
1034 case '=':
1035 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001036 }
1037 break;
1038 }
1039 break;
1040 case '>':
1041 switch (c2) {
1042 case '>':
1043 switch (c3) {
1044 case '=':
1045 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001046 }
1047 break;
1048 }
1049 break;
1050 case '*':
1051 switch (c2) {
1052 case '*':
1053 switch (c3) {
1054 case '=':
1055 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001056 }
1057 break;
1058 }
1059 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001060 case '/':
1061 switch (c2) {
1062 case '/':
1063 switch (c3) {
1064 case '=':
1065 return DOUBLESLASHEQUAL;
1066 }
1067 break;
1068 }
1069 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001070 }
1071 return OP;
1072}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001073
Guido van Rossum926f13a1998-04-09 21:38:06 +00001074static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001075indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001076{
1077 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001078 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001079 tok->cur = tok->inp;
1080 return 1;
1081 }
1082 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001083 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1084 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001085 tok->altwarning = 0;
1086 }
1087 return 0;
1088}
1089
1090
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091/* Get next token, after space stripping etc. */
1092
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001093static int
1094tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001095{
1096 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001097 int blankline;
1098
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001099 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001100 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001101 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001102 blankline = 0;
1103
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104 /* Get indentation level */
1105 if (tok->atbol) {
1106 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001107 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109 for (;;) {
1110 c = tok_nextc(tok);
1111 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112 col++, altcol++;
1113 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001115 altcol = (altcol/tok->alttabsize + 1)
1116 * tok->alttabsize;
1117 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001118 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001119 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 else
1121 break;
1122 }
1123 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001124 if (c == '#' || c == '\n') {
1125 /* Lines with only whitespace and/or comments
1126 shouldn't affect the indentation and are
1127 not passed to the parser as NEWLINE tokens,
1128 except *totally* empty lines in interactive
1129 mode, which signal the end of a command group. */
1130 if (col == 0 && c == '\n' && tok->prompt != NULL)
1131 blankline = 0; /* Let it through */
1132 else
1133 blankline = 1; /* Ignore completely */
1134 /* We can't jump back right here since we still
1135 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001136 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001137 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001138 if (col == tok->indstack[tok->indent]) {
1139 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001140 if (altcol != tok->altindstack[tok->indent]) {
1141 if (indenterror(tok))
1142 return ERRORTOKEN;
1143 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001145 else if (col > tok->indstack[tok->indent]) {
1146 /* Indent -- always one */
1147 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001148 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001149 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001150 return ERRORTOKEN;
1151 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001152 if (altcol <= tok->altindstack[tok->indent]) {
1153 if (indenterror(tok))
1154 return ERRORTOKEN;
1155 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001156 tok->pendin++;
1157 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001158 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001160 else /* col < tok->indstack[tok->indent] */ {
1161 /* Dedent -- any number, must be consistent */
1162 while (tok->indent > 0 &&
1163 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001165 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001166 }
1167 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001168 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001169 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001170 return ERRORTOKEN;
1171 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001172 if (altcol != tok->altindstack[tok->indent]) {
1173 if (indenterror(tok))
1174 return ERRORTOKEN;
1175 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 }
1177 }
1178 }
1179
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001180 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001181
1182 /* Return pending indents/dedents */
1183 if (tok->pendin != 0) {
1184 if (tok->pendin < 0) {
1185 tok->pendin++;
1186 return DEDENT;
1187 }
1188 else {
1189 tok->pendin--;
1190 return INDENT;
1191 }
1192 }
1193
1194 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001195 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001196 /* Skip spaces */
1197 do {
1198 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001199 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200
1201 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001202 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203
Guido van Rossumab5ca152000-03-31 00:52:27 +00001204 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001206 static char *tabforms[] = {
1207 "tab-width:", /* Emacs */
1208 ":tabstop=", /* vim, full form */
1209 ":ts=", /* vim, abbreviated form */
1210 "set tabsize=", /* will vi never die? */
1211 /* more templates can be added here to support other editors */
1212 };
1213 char cbuf[80];
1214 char *tp, **cp;
1215 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001217 *tp++ = c = tok_nextc(tok);
1218 } while (c != EOF && c != '\n' &&
1219 tp - cbuf + 1 < sizeof(cbuf));
1220 *tp = '\0';
1221 for (cp = tabforms;
1222 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1223 cp++) {
1224 if ((tp = strstr(cbuf, *cp))) {
1225 int newsize = atoi(tp + strlen(*cp));
1226
1227 if (newsize >= 1 && newsize <= 40) {
1228 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001229 if (Py_VerboseFlag)
1230 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001231 "Tab size set to %d\n",
1232 newsize);
1233 }
1234 }
1235 }
1236 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001238 }
1239
1240 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001241 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001242 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001243 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244
1245 /* Identifier (most frequent token!) */
1246 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001247 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001248 switch (c) {
1249 case 'r':
1250 case 'R':
1251 c = tok_nextc(tok);
1252 if (c == '"' || c == '\'')
1253 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001254 break;
1255 case 'u':
1256 case 'U':
1257 c = tok_nextc(tok);
1258 if (c == 'r' || c == 'R')
1259 c = tok_nextc(tok);
1260 if (c == '"' || c == '\'')
1261 goto letter_quote;
1262 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001263 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001264 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001265 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001266 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001268 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 *p_end = tok->cur;
1270 return NAME;
1271 }
1272
1273 /* Newline */
1274 if (c == '\n') {
1275 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001276 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001277 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001278 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001280 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 return NEWLINE;
1282 }
1283
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001284 /* Period or number starting with period? */
1285 if (c == '.') {
1286 c = tok_nextc(tok);
1287 if (isdigit(c)) {
1288 goto fraction;
1289 }
1290 else {
1291 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001292 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001293 *p_end = tok->cur;
1294 return DOT;
1295 }
1296 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001297
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 /* Number */
1299 if (isdigit(c)) {
1300 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001301 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 c = tok_nextc(tok);
1303 if (c == '.')
1304 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001305#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001306 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001307 goto imaginary;
1308#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 if (c == 'x' || c == 'X') {
1310 /* Hex */
1311 do {
1312 c = tok_nextc(tok);
1313 } while (isxdigit(c));
1314 }
1315 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001316 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 /* Octal; c is first char of it */
1318 /* There's no 'isoctdigit' macro, sigh */
1319 while ('0' <= c && c < '8') {
1320 c = tok_nextc(tok);
1321 }
Tim Petersd507dab2001-08-30 20:51:59 +00001322 if (isdigit(c)) {
1323 found_decimal = 1;
1324 do {
1325 c = tok_nextc(tok);
1326 } while (isdigit(c));
1327 }
1328 if (c == '.')
1329 goto fraction;
1330 else if (c == 'e' || c == 'E')
1331 goto exponent;
1332#ifndef WITHOUT_COMPLEX
1333 else if (c == 'j' || c == 'J')
1334 goto imaginary;
1335#endif
1336 else if (found_decimal) {
1337 tok->done = E_TOKEN;
1338 tok_backup(tok, c);
1339 return ERRORTOKEN;
1340 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001342 if (c == 'l' || c == 'L')
1343 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 }
1345 else {
1346 /* Decimal */
1347 do {
1348 c = tok_nextc(tok);
1349 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001350 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001352 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001353 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001354 if (c == '.') {
1355 fraction:
1356 /* Fraction */
1357 do {
1358 c = tok_nextc(tok);
1359 } while (isdigit(c));
1360 }
1361 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001362 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001363 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001365 if (c == '+' || c == '-')
1366 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001367 if (!isdigit(c)) {
1368 tok->done = E_TOKEN;
1369 tok_backup(tok, c);
1370 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001371 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001372 do {
1373 c = tok_nextc(tok);
1374 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001376#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001377 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001378 /* Imaginary part */
1379 imaginary:
1380 c = tok_nextc(tok);
1381#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001382 }
1383 }
1384 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001386 *p_end = tok->cur;
1387 return NUMBER;
1388 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001389
1390 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001391 /* String */
1392 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001393 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001394 int quote = c;
1395 int triple = 0;
1396 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 for (;;) {
1398 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 if (c == '\n') {
1400 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001401 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001402 tok_backup(tok, c);
1403 return ERRORTOKEN;
1404 }
1405 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001406 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 }
1408 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001409 if (triple)
1410 tok->done = E_EOFS;
1411 else
1412 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001413 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 return ERRORTOKEN;
1415 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001416 else if (c == quote) {
1417 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001418 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001419 c = tok_nextc(tok);
1420 if (c == quote) {
1421 triple = 1;
1422 tripcount = 0;
1423 continue;
1424 }
1425 tok_backup(tok, c);
1426 }
1427 if (!triple || tripcount == 3)
1428 break;
1429 }
1430 else if (c == '\\') {
1431 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001432 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001433 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001434 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001435 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001436 return ERRORTOKEN;
1437 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001439 else
1440 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001442 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001443 *p_end = tok->cur;
1444 return STRING;
1445 }
1446
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001447 /* Line continuation */
1448 if (c == '\\') {
1449 c = tok_nextc(tok);
1450 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001451 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001452 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001453 return ERRORTOKEN;
1454 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001455 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001456 goto again; /* Read next line */
1457 }
1458
Guido van Rossumfbab9051991-10-20 20:25:03 +00001459 /* Check for two-character token */
1460 {
1461 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001462 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001463 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001464 int c3 = tok_nextc(tok);
1465 int token3 = PyToken_ThreeChars(c, c2, c3);
1466 if (token3 != OP) {
1467 token = token3;
1468 } else {
1469 tok_backup(tok, c3);
1470 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001471 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001472 *p_end = tok->cur;
1473 return token;
1474 }
1475 tok_backup(tok, c2);
1476 }
1477
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001478 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001479 switch (c) {
1480 case '(':
1481 case '[':
1482 case '{':
1483 tok->level++;
1484 break;
1485 case ')':
1486 case ']':
1487 case '}':
1488 tok->level--;
1489 break;
1490 }
1491
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001492 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001493 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001494 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001495 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001496}
1497
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001498int
1499PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1500{
1501 int result = tok_get(tok, p_start, p_end);
1502 if (tok->decoding_erred) {
1503 result = ERRORTOKEN;
1504 tok->done = E_DECODE;
1505 }
1506 return result;
1507}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001508
Guido van Rossum408027e1996-12-30 16:17:54 +00001509#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001510
1511void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001512tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001513{
Guido van Rossum86bea461997-04-29 21:03:06 +00001514 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001515 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1516 printf("(%.*s)", (int)(end - start), start);
1517}
1518
1519#endif