blob: 37e6c3349bac6d48917b64dc77574e7a95f81fad [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* PyOS_Readline() returns a malloc'ed string that still includes the
   trailing \n; an empty malloc'ed string means EOF and NULL means the
   read was interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Tab stop width.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Map a character that may be signed onto a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif

/* Forward declarations of helpers defined further down */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, listed in the order of the token numbers.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",
	/* Brackets and punctuation */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	/* Single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",
	/* Comparisons */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* Further operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	"DOUBLESTAR",
	/* Augmented assignment */
	"PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
	"PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
	"DOUBLESLASH", "DOUBLESLASHEQUAL",
	"AT",
	/* Pseudo entries closing the table */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Guido van Rossum86bea461997-04-29 21:03:06 +0000108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
130 tok->issued_encoding_warning = 0;
131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* PGEN build: the string is used as is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	(void)tok;
	return str;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167 PyMem_DEL(tok->buf);
168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
173new_string(const char *s, int len)
174{
175 char* result = PyMem_NEW(char, len + 1);
176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   The first 12 characters of S are lower-cased and '_' is turned into
   '-' before comparing.  Returns the literal "utf-8" or "iso-8859-1"
   when S is one of the recognized spellings (optionally followed by
   "-<suffix>"); otherwise returns S itself, unmodified. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* tolower() is only defined for values representable
			   as unsigned char (or EOF); a plain char >= 0x80
			   may be negative, so convert first. */
			buf[i] = (char)tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
205
/* Return the coding spec in S, or NULL if none is found.

   Only the first SIZE bytes of S are searched for the word "coding";
   S must be NUL-terminated.  The spec has the form
   "coding" (':' | '=') <blanks> <name>, where <name> consists of
   alphanumerics, '-', '_' and '.'.  The returned name is normalized by
   get_normal_name() and allocated with new_string(); the caller must
   release it with PyMem_DEL.  NULL is also returned when that
   allocation fails. */

static char *
get_coding_spec(const char *s, int size)
{
	int i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			/* Py_CHARMASK keeps the argument of isalnum()
			   nonnegative even where char is signed. */
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q;
				if (r == NULL)
					/* Out of memory: do not hand a NULL
					   pointer to get_normal_name(). */
					return NULL;
				q = get_normal_name(r);
				if (r != q) {
					/* Canonical name is a string literal;
					   return an owned copy instead. */
					PyMem_DEL(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
256check_coding_spec(const char* line, int size, struct tok_state *tok,
257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
281 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000282#else
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
286 won't matter. */
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000287 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289 }
290 } else { /* then, compare cs with BOM */
291 r = (strcmp(tok->encoding, cs) == 0);
292 PyMem_DEL(cs);
293 }
294 }
295 return r;
296}
297
298/* See whether the file starts with a BOM. If it does,
299 invoke the set_readline function with the new encoding.
300 Return 1 on success, 0 on failure. */
301
302static int
303check_bom(int get_char(struct tok_state *),
304 void unget_char(int, struct tok_state *),
305 int set_readline(struct tok_state *, const char *),
306 struct tok_state *tok)
307{
308 int ch = get_char(tok);
309 tok->decoding_state = 1;
310 if (ch == EOF) {
311 return 1;
312 } else if (ch == 0xEF) {
313 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
314 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
315#if 0
316 /* Disable support for UTF-16 BOMs until a decision
317 is made whether this needs to be supported. */
318 } else if (ch == 0xFE) {
319 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
320 if (!set_readline(tok, "utf-16-be")) return 0;
321 tok->decoding_state = -1;
322 } else if (ch == 0xFF) {
323 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
324 if (!set_readline(tok, "utf-16-le")) return 0;
325 tok->decoding_state = -1;
326#endif
327 } else {
328 unget_char(ch, tok);
329 return 1;
330 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000331 if (tok->encoding != NULL)
332 PyMem_DEL(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
334 return 1;
335 NON_BOM:
336 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
337 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
338 return 1;
339}
340
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   Return NULL on failure or at EOF, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject *pending = tok->decoding_buffer;
	PyObject *bytes = NULL;
	char *data;
	int nbytes;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (pending != NULL) {
		/* Take over the object stored by an earlier call */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(pending))
			bytes = pending;	/* case 3: already UTF-8 */
	}
	else {
		pending = PyObject_CallObject(tok->decoding_readline, NULL);
		if (pending == NULL)
			return error_ret(tok);
	}

	if (bytes == NULL) {
		bytes = PyUnicode_AsUTF8String(pending);
		Py_DECREF(pending);
		if (bytes == NULL)
			return error_ret(tok);
	}

	data = PyString_AsString(bytes);
	nbytes = PyString_GET_SIZE(bytes);
	if (nbytes > size) {
		/* Does not fit: keep the tail for the next call */
		tok->decoding_buffer =
			PyString_FromStringAndSize(data + size, nbytes - size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(bytes);
			return error_ret(tok);
		}
		nbytes = size;
	}
	memcpy(s, data, nbytes);
	s[nbytes] = '\0';
	Py_DECREF(bytes);
	return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
405
406/* Set the readline function for TOK to a StreamReader's
407 readline function. The StreamReader is named ENC.
408
409 This function is called from check_bom and check_coding_spec.
410
411 ENC is usually identical to the future value of tok->encoding,
412 except for the (currently unsupported) case of UTF-16.
413
414 Return 1 on success, 0 on failure. */
415
416static int
417fp_setreadl(struct tok_state *tok, const char* enc)
418{
419 PyObject *reader, *stream, *readline;
420
Martin v. Löwis95292d62002-12-11 14:04:59 +0000421 /* XXX: constify filename argument. */
422 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000423 if (stream == NULL)
424 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426 reader = PyCodec_StreamReader(enc, stream, NULL);
427 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 if (reader == NULL)
429 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430
431 readline = PyObject_GetAttrString(reader, "readline");
432 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000433 if (readline == NULL)
434 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000435
436 tok->decoding_readline = readline;
437 return 1;
438}
439
440/* Fetch the next byte from TOK. */
441
442static int fp_getc(struct tok_state *tok) {
443 return getc(tok->fp);
444}
445
446/* Unfetch the last byte back into TOK. */
447
448static void fp_ungetc(int c, struct tok_state *tok) {
449 ungetc(c, tok->fp);
450}
451
452/* Read a line of input from TOK. Determine encoding
453 if necessary. */
454
455static char *
456decoding_fgets(char *s, int size, struct tok_state *tok)
457{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000458 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000460 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000461 if (tok->decoding_state < 0) {
462 /* We already have a codec associated with
463 this input. */
464 line = fp_readl(s, size, tok);
465 break;
466 } else if (tok->decoding_state > 0) {
467 /* We want a 'raw' read. */
468 line = Py_UniversalNewlineFgets(s, size,
469 tok->fp, NULL);
470 warn = 1;
471 break;
472 } else {
473 /* We have not yet determined the encoding.
474 If an encoding is found, use the file-pointer
475 reader functions from now on. */
476 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
477 return error_ret(tok);
478 assert(tok->decoding_state != 0);
479 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000480 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
482 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
483 return error_ret(tok);
484 }
485 }
486#ifndef PGEN
487 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
488 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000489 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000490 if (*c > 127) {
491 badchar = *c;
492 break;
493 }
494 }
495 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000496 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000499 sprintf(buf,
500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
503 "see http://www.python.org/peps/pep-0263.html for details",
504 badchar, tok->filename, tok->lineno + 1);
505 /* We don't use PyErr_WarnExplicit() here because
506 printing the line in question to e.g. a log file
507 could result in sensitive information being
508 exposed. */
509 PyErr_Warn(PyExc_DeprecationWarning, buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510 tok->issued_encoding_warning = 1;
511 }
512#endif
513 return line;
514}
515
516static int
517decoding_feof(struct tok_state *tok)
518{
519 if (tok->decoding_state >= 0) {
520 return feof(tok->fp);
521 } else {
522 PyObject* buf = tok->decoding_buffer;
523 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000524 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000525 if (buf == NULL) {
526 error_ret(tok);
527 return 1;
528 } else {
529 tok->decoding_buffer = buf;
530 }
531 }
532 return PyObject_Length(buf) == 0;
533 }
534}
535
536/* Fetch a byte from TOK, using the string buffer. */
537
538static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000539 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540}
541
542/* Unfetch a byte from TOK, using the string buffer. */
543
544static void buf_ungetc(int c, struct tok_state *tok) {
545 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000546 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547}
548
549/* Set the readline function for TOK to ENC. For the string-based
550 tokenizer, this means to just record the encoding. */
551
552static int buf_setreadl(struct tok_state *tok, const char* enc) {
553 tok->enc = enc;
554 return 1;
555}
556
/* Return a UTF-8 encoded Python string object made from the
   C byte string STR, which is encoded with ENC.
   Returns NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *encoded = NULL;
	PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

	if (decoded != NULL) {
		encoded = PyUnicode_AsUTF8String(decoded);
		Py_DECREF(decoded);
	}
	return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572
573/* Decode a byte string STR for use as the buffer of TOK.
574 Look for encoding declarations inside STR, and record them
575 inside TOK. */
576
577static const char *
578decode_str(const char *str, struct tok_state *tok)
579{
580 PyObject* utf8 = NULL;
581 const char *s;
582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000586 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000588 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000589#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 str = PyString_AsString(utf8);
595 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
600 lineno++;
601 if (lineno == 2) break;
602 }
603 }
604 tok->enc = NULL;
605 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000606 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000607#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 if (tok->enc != NULL) {
609 assert(utf8 == NULL);
610 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000611 if (utf8 == NULL) {
612 PyErr_Format(PyExc_SyntaxError,
613 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000614 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000615 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000616 str = PyString_AsString(utf8);
617 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000618#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 assert(tok->decoding_buffer == NULL);
620 tok->decoding_buffer = utf8; /* CAUTION */
621 return str;
622}
623
624#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625
626/* Set up tokenizer for string */
627
628struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000629PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630{
631 struct tok_state *tok = tok_new();
632 if (tok == NULL)
633 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000634 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000635 if (str == NULL) {
636 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000638 }
639
Martin v. Löwis95292d62002-12-11 14:04:59 +0000640 /* XXX: constify members. */
641 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642 return tok;
643}
644
645
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000646/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647
648struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000649PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650{
651 struct tok_state *tok = tok_new();
652 if (tok == NULL)
653 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000654 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000655 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 return NULL;
657 }
658 tok->cur = tok->inp = tok->buf;
659 tok->end = tok->buf + BUFSIZ;
660 tok->fp = fp;
661 tok->prompt = ps1;
662 tok->nextprompt = ps2;
663 return tok;
664}
665
666
667/* Free a tok_state structure */
668
669void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000670PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000671{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000672 if (tok->encoding != NULL)
673 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000674#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675 Py_XDECREF(tok->decoding_readline);
676 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000677#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000679 PyMem_DEL(tok->buf);
680 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681}
682
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the line *INP, read from stdin, into UTF-8, using the
   encoding recorded on sys.stdin.

   On success *INP is replaced by a newly allocated UTF-8 string and
   tok->encoding is set to the name of the stdin encoding.  Nothing is
   changed when sys.stdin is not the C stdin, carries no string
   encoding, or the line cannot be decoded.
   Returns 0 in all those cases, -1 (with tok->done set to E_NOMEM)
   when memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc_obj, *stdin_obj, *unicode, *encoded;
	const char *enc_name;
	char *recoded;

	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	stdin_obj = PySys_GetObject("stdin");
	if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
		return 0;

	enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
	if (enc_obj == NULL || !PyString_Check(enc_obj))
		return 0;
	/* Hold on to the encoding object while its name is in use */
	Py_INCREF(enc_obj);
	enc_name = PyString_AsString(enc_obj);

	unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
	if (unicode == NULL)
		goto undecodable;

	encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
	Py_DECREF(unicode);
	if (encoded == NULL)
		goto undecodable;

	recoded = new_string(PyString_AsString(encoded),
			     PyString_Size(encoded));
	Py_DECREF(encoded);
	if (recoded == NULL)
		goto out_of_memory;

	/* Swap the caller's line for the recoded one */
	PyMem_FREE(*inp);
	*inp = recoded;

	if (tok->encoding != NULL)
		PyMem_DEL(tok->encoding);
	tok->encoding = new_string(enc_name, strlen(enc_name));
	if (tok->encoding == NULL)
		goto out_of_memory;

	Py_DECREF(enc_obj);
	return 0;

out_of_memory:
	Py_DECREF(enc_obj);
	tok->done = E_NOMEM;
	return -1;

undecodable:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc_obj);
	PyErr_Clear();
	return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740
741/* Get next char, updating state; error code goes into tok->done */
742
743static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000744tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000745{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000747 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000748 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000749 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000750 if (tok->done != E_OK)
751 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000753 char *end = strchr(tok->inp, '\n');
754 if (end != NULL)
755 end++;
756 else {
757 end = strchr(tok->inp, '\0');
758 if (end == tok->inp) {
759 tok->done = E_EOF;
760 return EOF;
761 }
762 }
763 if (tok->start == NULL)
764 tok->buf = tok->cur;
765 tok->lineno++;
766 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000767 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000770 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000771 if (tok->nextprompt != NULL)
772 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000773 if (new == NULL)
774 tok->done = E_INTR;
775 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000776 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777 tok->done = E_EOF;
778 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000779#if !defined(PGEN) && defined(Py_USING_UNICODE)
780 else if (tok_stdin_decode(tok, &new) != 0)
781 PyMem_FREE(new);
782#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000784 size_t start = tok->start - tok->buf;
785 size_t oldlen = tok->cur - tok->buf;
786 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000787 char *buf = tok->buf;
788 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000789 tok->lineno++;
790 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000791 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000792 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000793 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000794 tok->done = E_NOMEM;
795 return EOF;
796 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000797 tok->buf = buf;
798 tok->cur = tok->buf + oldlen;
799 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000800 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000801 tok->inp = tok->buf + newlen;
802 tok->end = tok->inp + 1;
803 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000804 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000805 else {
806 tok->lineno++;
807 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000808 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000809 tok->buf = new;
810 tok->cur = tok->buf;
811 tok->inp = strchr(tok->buf, '\0');
812 tok->end = tok->inp + 1;
813 }
814 }
815 else {
816 int done = 0;
817 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000818 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 if (tok->start == NULL) {
820 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000821 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 if (tok->buf == NULL) {
823 tok->done = E_NOMEM;
824 return EOF;
825 }
826 tok->end = tok->buf + BUFSIZ;
827 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000828 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
829 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 tok->done = E_EOF;
831 done = 1;
832 }
833 else {
834 tok->done = E_OK;
835 tok->inp = strchr(tok->buf, '\0');
836 done = tok->inp[-1] == '\n';
837 }
838 }
839 else {
840 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000841 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000842 tok->done = E_EOF;
843 done = 1;
844 }
845 else
846 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000847 }
848 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000849 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000850 while (!done) {
851 int curstart = tok->start == NULL ? -1 :
852 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000853 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000854 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000855 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000856 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000857 if (newbuf == NULL) {
858 tok->done = E_NOMEM;
859 tok->cur = tok->inp;
860 return EOF;
861 }
862 tok->buf = newbuf;
863 tok->inp = tok->buf + curvalid;
864 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 tok->start = curstart < 0 ? NULL :
866 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000867 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000868 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000869 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000870 /* Last line does not end in \n,
871 fake one */
872 strcpy(tok->inp, "\n");
873 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000874 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000875 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000877 tok->cur = tok->buf + cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000878 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000879 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000880 pt = tok->inp - 2;
881 if (pt >= tok->buf && *pt == '\r') {
882 *pt++ = '\n';
883 *pt = '\0';
884 tok->inp = pt;
885 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000886 }
887 if (tok->done != E_OK) {
888 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000889 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000890 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000891 return EOF;
892 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000893 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000895}
896
897
898/* Back-up one character */
899
900static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000901tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000902{
903 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000904 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000905 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000906 if (*tok->cur != c)
907 *tok->cur = c;
908 }
909}
910
911
912/* Return the token corresponding to a single character */
913
914int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000915PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916{
917 switch (c) {
918 case '(': return LPAR;
919 case ')': return RPAR;
920 case '[': return LSQB;
921 case ']': return RSQB;
922 case ':': return COLON;
923 case ',': return COMMA;
924 case ';': return SEMI;
925 case '+': return PLUS;
926 case '-': return MINUS;
927 case '*': return STAR;
928 case '/': return SLASH;
929 case '|': return VBAR;
930 case '&': return AMPER;
931 case '<': return LESS;
932 case '>': return GREATER;
933 case '=': return EQUAL;
934 case '.': return DOT;
935 case '%': return PERCENT;
936 case '`': return BACKQUOTE;
937 case '{': return LBRACE;
938 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000939 case '^': return CIRCUMFLEX;
940 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000941 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942 default: return OP;
943 }
944}
945
946
Guido van Rossumfbab9051991-10-20 20:25:03 +0000947int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000948PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000949{
950 switch (c1) {
951 case '=':
952 switch (c2) {
953 case '=': return EQEQUAL;
954 }
955 break;
956 case '!':
957 switch (c2) {
958 case '=': return NOTEQUAL;
959 }
960 break;
961 case '<':
962 switch (c2) {
963 case '>': return NOTEQUAL;
964 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000965 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000966 }
967 break;
968 case '>':
969 switch (c2) {
970 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000971 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000972 }
973 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000974 case '+':
975 switch (c2) {
976 case '=': return PLUSEQUAL;
977 }
978 break;
979 case '-':
980 switch (c2) {
981 case '=': return MINEQUAL;
982 }
983 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000984 case '*':
985 switch (c2) {
986 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000987 case '=': return STAREQUAL;
988 }
989 break;
990 case '/':
991 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000992 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000993 case '=': return SLASHEQUAL;
994 }
995 break;
996 case '|':
997 switch (c2) {
998 case '=': return VBAREQUAL;
999 }
1000 break;
1001 case '%':
1002 switch (c2) {
1003 case '=': return PERCENTEQUAL;
1004 }
1005 break;
1006 case '&':
1007 switch (c2) {
1008 case '=': return AMPEREQUAL;
1009 }
1010 break;
1011 case '^':
1012 switch (c2) {
1013 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001014 }
1015 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001016 }
1017 return OP;
1018}
1019
Thomas Wouters434d0822000-08-24 20:11:32 +00001020int
1021PyToken_ThreeChars(int c1, int c2, int c3)
1022{
1023 switch (c1) {
1024 case '<':
1025 switch (c2) {
1026 case '<':
1027 switch (c3) {
1028 case '=':
1029 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001030 }
1031 break;
1032 }
1033 break;
1034 case '>':
1035 switch (c2) {
1036 case '>':
1037 switch (c3) {
1038 case '=':
1039 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001040 }
1041 break;
1042 }
1043 break;
1044 case '*':
1045 switch (c2) {
1046 case '*':
1047 switch (c3) {
1048 case '=':
1049 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001050 }
1051 break;
1052 }
1053 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001054 case '/':
1055 switch (c2) {
1056 case '/':
1057 switch (c3) {
1058 case '=':
1059 return DOUBLESLASHEQUAL;
1060 }
1061 break;
1062 }
1063 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001064 }
1065 return OP;
1066}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001067
Guido van Rossum926f13a1998-04-09 21:38:06 +00001068static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001069indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001070{
1071 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001072 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001073 tok->cur = tok->inp;
1074 return 1;
1075 }
1076 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001077 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1078 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001079 tok->altwarning = 0;
1080 }
1081 return 0;
1082}
1083
1084
/* Get next token, after space stripping etc.

   tok      tokenizer state; its cursor, indentation stacks, bracket
            nesting level and error code (tok->done) are updated here.
   p_start  receives a pointer to the first character of the token text
            inside tok's buffer.
   p_end    receives a pointer just past the last character of the
            token text.

   Both output pointers are reset to NULL on entry and are left NULL
   when INDENT, DEDENT, ENDMARKER or ERRORTOKEN is returned.

   Returns the token type.  On ERRORTOKEN, tok->done holds the E_*
   code describing the problem.

   Control flow note: the labels `fraction', `exponent' and `imaginary'
   live inside the decimal-number branch and are jumped into from the
   '.' and leading-'0' handling; `letter_quote' is jumped to from the
   identifier branch once a string prefix letter is followed by a
   quote. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with tok->tabsize, altcol with
           tok->alttabsize; the two are compared against their own
           stacks to detect inconsistent tab/space usage. */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                /* Advance to the next tab stop */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        /* Un-read the first non-whitespace character */
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside brackets */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                /* The new column must match an enclosing level exactly */
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* pendin > 0 means INDENTs are owed, pendin < 0 means DEDENTs are
       owed; one is handed out per call. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    /* Also reached by goto after a backslash-newline continuation */
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",       /* Emacs */
            ":tabstop=",        /* vim, full form */
            ":ts=",             /* vim, abbreviated form */
            "set tabsize=",     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy the text following '#' into cbuf, stopping at
           newline/EOF or after sizeof(cbuf)-1 characters so that the
           terminating NUL always fits.  The terminating '\n' (or EOF,
           truncated to char) is stored as well. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 tp - cbuf + 1 < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                /* The number directly after the marker is the new
                   tab size; values outside 1..40 are ignored. */
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        /* Consume whatever part of the comment did not fit in cbuf */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        /* If the prefix letters are not followed by a quote, the
           characters read so far are simply the start of a name and
           scanning continues in the loop below. */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Blank/comment-only lines and newlines inside brackets do
           not produce a NEWLINE token */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex or octal -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                /* NOTE(review): "0x" with no hex digits after it
                   still falls through and is returned as NUMBER. */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else {
                /* Set when an 8 or 9 follows the leading zero; such
                   a literal is only accepted if it continues as a
                   float or imaginary number. */
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long-integer suffix */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    /* At least one digit is required */
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the offset (from the token start) that tok->cur
           has right after reading the character that immediately
           follows the opening quote.  A quote found at that offset is
           either the end of an empty string or the second quote of a
           triple-quote opener. */
        int quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        /* Number of consecutive quote characters seen; three in a row
           close a triple-quoted string. */
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote in a row right after the opener:
                       peek at the next character to decide between
                       "" (empty string) and a triple-quote opener. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the escaped character, whatever it is */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        /* The backslash must be immediately followed by a newline */
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* Every three-character token extends a two-character
               one, so only look further after a two-character match */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    /* NOTE(review): an unmatched closing bracket drives tok->level
       below zero; nothing here guards against that. */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1491
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001492int
1493PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1494{
1495 int result = tok_get(tok, p_start, p_end);
1496 if (tok->decoding_erred) {
1497 result = ERRORTOKEN;
1498 tok->done = E_DECODE;
1499 }
1500 return result;
1501}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001502
Guido van Rossum408027e1996-12-30 16:17:54 +00001503#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001504
1505void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001506tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001507{
Guido van Rossum86bea461997-04-29 21:03:06 +00001508 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001509 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1510 printf("(%.*s)", (int)(end - start), start);
1511}
1512
1513#endif