blob: 8fc2c267deaba4e42fb03ab292368f5bab2356d7 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
/* Tab stop width used by the tokenizer.
   Don't ever change this -- it would break the portability of Python code. */
#define TABSIZE 8

/* Map a character, which may be of a signed type, onto a nonnegative int.
   XXX This assumes characters are 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c) (c)
#else
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* one-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* floor division and decorators */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT",

    /* catch-all and bookkeeping entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Guido van Rossum86bea461997-04-29 21:03:06 +0000108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
130 tok->issued_encoding_warning = 0;
131 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000132 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000133#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000136#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000137 return tok;
138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* pgen build: strings are used as they are; STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167 PyMem_DEL(tok->buf);
168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
173new_string(const char *s, int len)
174{
175 char* result = PyMem_NEW(char, len + 1);
176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Normalize the common spellings of the UTF-8 and Latin-1 encoding names.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the constant string "utf-8" or
   "iso-8859-1" when S names (or starts with the name of) one of those
   encodings; otherwise returns S itself, unmodified.  Callers can
   therefore detect a normalization by comparing the result with S. */
static char *
get_normal_name(char *s)    /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative
           value (a byte >= 0x80 where char is signed) is undefined. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
/* Return the coding spec in S, or NULL if none is found.

   S is scanned up to SIZE bytes.  A spec has the form "coding:" or
   "coding=" followed by optional blanks and an encoding name made of
   alphanumerics, '-', '_' and '.'.  The result is a newly allocated
   string holding the normalized encoding name (see get_normal_name);
   the caller owns it.  NULL is also returned when memory for the result
   cannot be allocated. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* Convert through unsigned char: isalnum() is undefined for
           negative arguments, which bytes >= 0x80 would produce on
           platforms where char is signed. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;       /* no name after the separator; keep looking */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;    /* out of memory: get_normal_name needs a string */
        q = get_normal_name(r);
        if (r != q) {
            /* The name was normalized to a constant string; hand out
               an allocated copy instead so the caller can free it. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
256check_coding_spec(const char* line, int size, struct tok_state *tok,
257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000280#else
281 /* Without Unicode support, we cannot
282 process the coding spec. Since there
283 won't be any Unicode literals, that
284 won't matter. */
285#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 } else { /* then, compare cs with BOM */
288 r = (strcmp(tok->encoding, cs) == 0);
289 PyMem_DEL(cs);
290 }
291 }
292 return r;
293}
294
295/* See whether the file starts with a BOM. If it does,
296 invoke the set_readline function with the new encoding.
297 Return 1 on success, 0 on failure. */
298
299static int
300check_bom(int get_char(struct tok_state *),
301 void unget_char(int, struct tok_state *),
302 int set_readline(struct tok_state *, const char *),
303 struct tok_state *tok)
304{
305 int ch = get_char(tok);
306 tok->decoding_state = 1;
307 if (ch == EOF) {
308 return 1;
309 } else if (ch == 0xEF) {
310 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
311 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
312#if 0
313 /* Disable support for UTF-16 BOMs until a decision
314 is made whether this needs to be supported. */
315 } else if (ch == 0xFE) {
316 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
317 if (!set_readline(tok, "utf-16-be")) return 0;
318 tok->decoding_state = -1;
319 } else if (ch == 0xFF) {
320 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
321 if (!set_readline(tok, "utf-16-le")) return 0;
322 tok->decoding_state = -1;
323#endif
324 } else {
325 unget_char(ch, tok);
326 return 1;
327 }
328 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
329 return 1;
330 NON_BOM:
331 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
332 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
333 return 1;
334}
335
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   The line comes either from tok->decoding_buffer (a line fetched
   earlier, which is consumed here) or from calling
   tok->decoding_readline.  It is converted to UTF-8 and copied into S,
   which has room for SIZE bytes including the terminating NUL.

   NULL is returned at end of input (empty line) and, via error_ret, on
   any error.  A converted line that does not fit into S is treated as an
   error rather than being copied past the end of the buffer. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8;
    PyObject *buf = tok->decoding_buffer;
    const char *str;

    if (buf == NULL) {
        /* Ask for one less byte so we can terminate it */
        PyObject *args = Py_BuildValue("(i)", size-1);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    }
    else {
        /* Take over the reference held by the tokenizer state. */
        tok->decoding_buffer = NULL;
    }
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    if (utf8 == NULL)
        return error_ret(tok);

    str = PyString_AsString(utf8);
    /* The size passed to readline does not bound the length of the
       UTF-8 form, so check it for real: an assert() alone would let
       strcpy() overrun S in a build without assertions. */
    if (str == NULL || strlen(str) >= (size_t)size) {
        Py_DECREF(utf8);
        return error_ret(tok);
    }
    strcpy(s, str);
    Py_DECREF(utf8);
    if (s[0] == '\0')
        return NULL;    /* EOF */
    return s;
#endif
}
375
376/* Set the readline function for TOK to a StreamReader's
377 readline function. The StreamReader is named ENC.
378
379 This function is called from check_bom and check_coding_spec.
380
381 ENC is usually identical to the future value of tok->encoding,
382 except for the (currently unsupported) case of UTF-16.
383
384 Return 1 on success, 0 on failure. */
385
386static int
387fp_setreadl(struct tok_state *tok, const char* enc)
388{
389 PyObject *reader, *stream, *readline;
390
Martin v. Löwis95292d62002-12-11 14:04:59 +0000391 /* XXX: constify filename argument. */
392 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000393 if (stream == NULL)
394 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000395
396 reader = PyCodec_StreamReader(enc, stream, NULL);
397 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000398 if (reader == NULL)
399 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400
401 readline = PyObject_GetAttrString(reader, "readline");
402 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000403 if (readline == NULL)
404 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000405
406 tok->decoding_readline = readline;
407 return 1;
408}
409
410/* Fetch the next byte from TOK. */
411
412static int fp_getc(struct tok_state *tok) {
413 return getc(tok->fp);
414}
415
416/* Unfetch the last byte back into TOK. */
417
418static void fp_ungetc(int c, struct tok_state *tok) {
419 ungetc(c, tok->fp);
420}
421
422/* Read a line of input from TOK. Determine encoding
423 if necessary. */
424
425static char *
426decoding_fgets(char *s, int size, struct tok_state *tok)
427{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000428 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000430 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000431 if (tok->decoding_state < 0) {
432 /* We already have a codec associated with
433 this input. */
434 line = fp_readl(s, size, tok);
435 break;
436 } else if (tok->decoding_state > 0) {
437 /* We want a 'raw' read. */
438 line = Py_UniversalNewlineFgets(s, size,
439 tok->fp, NULL);
440 warn = 1;
441 break;
442 } else {
443 /* We have not yet determined the encoding.
444 If an encoding is found, use the file-pointer
445 reader functions from now on. */
446 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
447 return error_ret(tok);
448 assert(tok->decoding_state != 0);
449 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000450 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000451 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
452 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
453 return error_ret(tok);
454 }
455 }
456#ifndef PGEN
457 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
458 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000459 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460 if (*c > 127) {
461 badchar = *c;
462 break;
463 }
464 }
465 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000466 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000467 /* Need to add 1 to the line number, since this line
468 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000469 sprintf(buf,
470 "Non-ASCII character '\\x%.2x' "
471 "in file %.200s on line %i, "
472 "but no encoding declared; "
473 "see http://www.python.org/peps/pep-0263.html for details",
474 badchar, tok->filename, tok->lineno + 1);
475 /* We don't use PyErr_WarnExplicit() here because
476 printing the line in question to e.g. a log file
477 could result in sensitive information being
478 exposed. */
479 PyErr_Warn(PyExc_DeprecationWarning, buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480 tok->issued_encoding_warning = 1;
481 }
482#endif
483 return line;
484}
485
486static int
487decoding_feof(struct tok_state *tok)
488{
489 if (tok->decoding_state >= 0) {
490 return feof(tok->fp);
491 } else {
492 PyObject* buf = tok->decoding_buffer;
493 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000494 PyObject *args = PyTuple_New(0);
495 if (args == NULL) {
496 error_ret(tok);
497 return 1;
498 }
499 buf = PyObject_Call(tok->decoding_readline,
500 args, NULL);
501 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000502 if (buf == NULL) {
503 error_ret(tok);
504 return 1;
505 } else {
506 tok->decoding_buffer = buf;
507 }
508 }
509 return PyObject_Length(buf) == 0;
510 }
511}
512
513/* Fetch a byte from TOK, using the string buffer. */
514
515static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000516 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517}
518
519/* Unfetch a byte from TOK, using the string buffer. */
520
521static void buf_ungetc(int c, struct tok_state *tok) {
522 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000523 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524}
525
526/* Set the readline function for TOK to ENC. For the string-based
527 tokenizer, this means to just record the encoding. */
528
529static int buf_setreadl(struct tok_state *tok, const char* enc) {
530 tok->enc = enc;
531 return 1;
532}
533
/* Return a UTF-8 encoded Python string object made from the
   C byte string STR, which is encoded with ENC.
   Returns NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549
550/* Decode a byte string STR for use as the buffer of TOK.
551 Look for encoding declarations inside STR, and record them
552 inside TOK. */
553
554static const char *
555decode_str(const char *str, struct tok_state *tok)
556{
557 PyObject* utf8 = NULL;
558 const char *s;
559 int lineno = 0;
560 tok->enc = NULL;
561 tok->str = str;
562 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
563 return NULL;
564 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000565 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000566#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567 if (tok->enc != NULL) {
568 utf8 = translate_into_utf8(str, tok->enc);
569 if (utf8 == NULL)
570 return NULL;
571 str = PyString_AsString(utf8);
572 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000573#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574 for (s = str;; s++) {
575 if (*s == '\0') break;
576 else if (*s == '\n') {
577 lineno++;
578 if (lineno == 2) break;
579 }
580 }
581 tok->enc = NULL;
582 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
583 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000584#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 if (tok->enc != NULL) {
586 assert(utf8 == NULL);
587 utf8 = translate_into_utf8(str, tok->enc);
588 if (utf8 == NULL)
589 return NULL;
590 str = PyString_AsString(utf8);
591 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000592#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593 assert(tok->decoding_buffer == NULL);
594 tok->decoding_buffer = utf8; /* CAUTION */
595 return str;
596}
597
598#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000599
600/* Set up tokenizer for string */
601
602struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000603PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000604{
605 struct tok_state *tok = tok_new();
606 if (tok == NULL)
607 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 str = (char *)decode_str(str, tok);
609 if (str == NULL)
610 return NULL;
Martin v. Löwis95292d62002-12-11 14:04:59 +0000611 /* XXX: constify members. */
612 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000613 return tok;
614}
615
616
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000617/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000618
619struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000620PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000621{
622 struct tok_state *tok = tok_new();
623 if (tok == NULL)
624 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000625 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
626 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000627 return NULL;
628 }
629 tok->cur = tok->inp = tok->buf;
630 tok->end = tok->buf + BUFSIZ;
631 tok->fp = fp;
632 tok->prompt = ps1;
633 tok->nextprompt = ps2;
634 return tok;
635}
636
637
638/* Free a tok_state structure */
639
640void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000641PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643 if (tok->encoding != NULL)
644 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000645#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646 Py_XDECREF(tok->decoding_readline);
647 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000648#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000650 PyMem_DEL(tok->buf);
651 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652}
653
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Convert the interactive input line *INP from the encoding of
   sys.stdin to UTF-8.

   Nothing is done (and 0 is returned) unless sys.stdin is a file object
   wrapping the C stdin and carries a string f_encoding.  On successful
   conversion *INP is freed and replaced by a newly allocated UTF-8
   string, and tok->encoding is replaced by a copy of the encoding name.
   If the line cannot be decoded or re-encoded, the Python error is
   cleared and *INP is left alone.

   Returns 0 in all those cases.  Returns -1 with tok->done set to
   E_NOMEM when an allocation fails; *INP is then still owned by the
   caller. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded;
    PyObject *utf8 = NULL;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded != NULL) {
        utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
        Py_DECREF(decoded);
    }
    if (utf8 == NULL) {
        /* Fallback to iso-8859-1: for backward compatibility */
        Py_DECREF(enc);
        PyErr_Clear();
        return 0;
    }

    converted = new_string(PyString_AsString(utf8), PyString_Size(utf8));
    Py_DECREF(utf8);
    if (converted != NULL) {
        PyMem_FREE(*inp);
        *inp = converted;
        if (tok->encoding != NULL)
            PyMem_DEL(tok->encoding);
        tok->encoding = new_string(encoding, strlen(encoding));
        if (tok->encoding != NULL) {
            Py_DECREF(enc);
            return 0;
        }
    }

    /* One of the two copies could not be allocated. */
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711
712/* Get next char, updating state; error code goes into tok->done */
713
714static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000715tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000716{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000718 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000719 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000720 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000721 if (tok->done != E_OK)
722 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000724 char *end = strchr(tok->inp, '\n');
725 if (end != NULL)
726 end++;
727 else {
728 end = strchr(tok->inp, '\0');
729 if (end == tok->inp) {
730 tok->done = E_EOF;
731 return EOF;
732 }
733 }
734 if (tok->start == NULL)
735 tok->buf = tok->cur;
736 tok->lineno++;
737 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000738 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000741 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 if (tok->nextprompt != NULL)
743 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000744 if (new == NULL)
745 tok->done = E_INTR;
746 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000747 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748 tok->done = E_EOF;
749 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000750#if !defined(PGEN) && defined(Py_USING_UNICODE)
751 else if (tok_stdin_decode(tok, &new) != 0)
752 PyMem_FREE(new);
753#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000754 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000755 size_t start = tok->start - tok->buf;
756 size_t oldlen = tok->cur - tok->buf;
757 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000758 char *buf = tok->buf;
759 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000760 tok->lineno++;
761 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000762 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000763 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000764 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000765 tok->done = E_NOMEM;
766 return EOF;
767 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000768 tok->buf = buf;
769 tok->cur = tok->buf + oldlen;
770 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000771 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000772 tok->inp = tok->buf + newlen;
773 tok->end = tok->inp + 1;
774 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000775 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000776 else {
777 tok->lineno++;
778 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000779 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000780 tok->buf = new;
781 tok->cur = tok->buf;
782 tok->inp = strchr(tok->buf, '\0');
783 tok->end = tok->inp + 1;
784 }
785 }
786 else {
787 int done = 0;
788 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000789 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000790 if (tok->start == NULL) {
791 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000792 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 if (tok->buf == NULL) {
794 tok->done = E_NOMEM;
795 return EOF;
796 }
797 tok->end = tok->buf + BUFSIZ;
798 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000799 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
800 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000801 tok->done = E_EOF;
802 done = 1;
803 }
804 else {
805 tok->done = E_OK;
806 tok->inp = strchr(tok->buf, '\0');
807 done = tok->inp[-1] == '\n';
808 }
809 }
810 else {
811 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000812 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000813 tok->done = E_EOF;
814 done = 1;
815 }
816 else
817 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 }
819 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000820 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 while (!done) {
822 int curstart = tok->start == NULL ? -1 :
823 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000824 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000825 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000826 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000827 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000828 if (newbuf == NULL) {
829 tok->done = E_NOMEM;
830 tok->cur = tok->inp;
831 return EOF;
832 }
833 tok->buf = newbuf;
834 tok->inp = tok->buf + curvalid;
835 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 tok->start = curstart < 0 ? NULL :
837 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000838 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000839 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000840 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000841 /* Last line does not end in \n,
842 fake one */
843 strcpy(tok->inp, "\n");
844 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000845 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000847 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 tok->cur = tok->buf + cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000849 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000850 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000851 pt = tok->inp - 2;
852 if (pt >= tok->buf && *pt == '\r') {
853 *pt++ = '\n';
854 *pt = '\0';
855 tok->inp = pt;
856 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857 }
858 if (tok->done != E_OK) {
859 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000860 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000861 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862 return EOF;
863 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000864 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000866}
867
868
869/* Back-up one character */
870
871static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000872tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873{
874 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000875 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000876 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000877 if (*tok->cur != c)
878 *tok->cur = c;
879 }
880}
881
882
883/* Return the token corresponding to a single character */
884
885int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000886PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887{
888 switch (c) {
889 case '(': return LPAR;
890 case ')': return RPAR;
891 case '[': return LSQB;
892 case ']': return RSQB;
893 case ':': return COLON;
894 case ',': return COMMA;
895 case ';': return SEMI;
896 case '+': return PLUS;
897 case '-': return MINUS;
898 case '*': return STAR;
899 case '/': return SLASH;
900 case '|': return VBAR;
901 case '&': return AMPER;
902 case '<': return LESS;
903 case '>': return GREATER;
904 case '=': return EQUAL;
905 case '.': return DOT;
906 case '%': return PERCENT;
907 case '`': return BACKQUOTE;
908 case '{': return LBRACE;
909 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000910 case '^': return CIRCUMFLEX;
911 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000912 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913 default: return OP;
914 }
915}
916
917
Guido van Rossumfbab9051991-10-20 20:25:03 +0000918int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000919PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000920{
921 switch (c1) {
922 case '=':
923 switch (c2) {
924 case '=': return EQEQUAL;
925 }
926 break;
927 case '!':
928 switch (c2) {
929 case '=': return NOTEQUAL;
930 }
931 break;
932 case '<':
933 switch (c2) {
934 case '>': return NOTEQUAL;
935 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000936 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000937 }
938 break;
939 case '>':
940 switch (c2) {
941 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000942 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000943 }
944 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000945 case '+':
946 switch (c2) {
947 case '=': return PLUSEQUAL;
948 }
949 break;
950 case '-':
951 switch (c2) {
952 case '=': return MINEQUAL;
953 }
954 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000955 case '*':
956 switch (c2) {
957 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000958 case '=': return STAREQUAL;
959 }
960 break;
961 case '/':
962 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000963 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000964 case '=': return SLASHEQUAL;
965 }
966 break;
967 case '|':
968 switch (c2) {
969 case '=': return VBAREQUAL;
970 }
971 break;
972 case '%':
973 switch (c2) {
974 case '=': return PERCENTEQUAL;
975 }
976 break;
977 case '&':
978 switch (c2) {
979 case '=': return AMPEREQUAL;
980 }
981 break;
982 case '^':
983 switch (c2) {
984 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000985 }
986 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000987 }
988 return OP;
989}
990
Thomas Wouters434d0822000-08-24 20:11:32 +0000991int
992PyToken_ThreeChars(int c1, int c2, int c3)
993{
994 switch (c1) {
995 case '<':
996 switch (c2) {
997 case '<':
998 switch (c3) {
999 case '=':
1000 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001001 }
1002 break;
1003 }
1004 break;
1005 case '>':
1006 switch (c2) {
1007 case '>':
1008 switch (c3) {
1009 case '=':
1010 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001011 }
1012 break;
1013 }
1014 break;
1015 case '*':
1016 switch (c2) {
1017 case '*':
1018 switch (c3) {
1019 case '=':
1020 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001021 }
1022 break;
1023 }
1024 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001025 case '/':
1026 switch (c2) {
1027 case '/':
1028 switch (c3) {
1029 case '=':
1030 return DOUBLESLASHEQUAL;
1031 }
1032 break;
1033 }
1034 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001035 }
1036 return OP;
1037}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001038
Guido van Rossum926f13a1998-04-09 21:38:06 +00001039static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001040indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001041{
1042 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001043 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001044 tok->cur = tok->inp;
1045 return 1;
1046 }
1047 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001048 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1049 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001050 tok->altwarning = 0;
1051 }
1052 return 0;
1053}
1054
1055
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056/* Get next token, after space stripping etc. */
1057
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001058static int
1059tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001060{
1061 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001062 int blankline;
1063
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001064 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001065 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001066 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001067 blankline = 0;
1068
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001069 /* Get indentation level */
1070 if (tok->atbol) {
1071 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001072 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074 for (;;) {
1075 c = tok_nextc(tok);
1076 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001077 col++, altcol++;
1078 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001079 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001080 altcol = (altcol/tok->alttabsize + 1)
1081 * tok->alttabsize;
1082 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001083 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001084 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001085 else
1086 break;
1087 }
1088 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001089 if (c == '#' || c == '\n') {
1090 /* Lines with only whitespace and/or comments
1091 shouldn't affect the indentation and are
1092 not passed to the parser as NEWLINE tokens,
1093 except *totally* empty lines in interactive
1094 mode, which signal the end of a command group. */
1095 if (col == 0 && c == '\n' && tok->prompt != NULL)
1096 blankline = 0; /* Let it through */
1097 else
1098 blankline = 1; /* Ignore completely */
1099 /* We can't jump back right here since we still
1100 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001102 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001103 if (col == tok->indstack[tok->indent]) {
1104 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105 if (altcol != tok->altindstack[tok->indent]) {
1106 if (indenterror(tok))
1107 return ERRORTOKEN;
1108 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 else if (col > tok->indstack[tok->indent]) {
1111 /* Indent -- always one */
1112 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001113 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001114 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001115 return ERRORTOKEN;
1116 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001117 if (altcol <= tok->altindstack[tok->indent]) {
1118 if (indenterror(tok))
1119 return ERRORTOKEN;
1120 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121 tok->pendin++;
1122 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001124 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001125 else /* col < tok->indstack[tok->indent] */ {
1126 /* Dedent -- any number, must be consistent */
1127 while (tok->indent > 0 &&
1128 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001129 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001130 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001131 }
1132 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001133 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001134 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001135 return ERRORTOKEN;
1136 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001137 if (altcol != tok->altindstack[tok->indent]) {
1138 if (indenterror(tok))
1139 return ERRORTOKEN;
1140 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141 }
1142 }
1143 }
1144
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001145 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146
1147 /* Return pending indents/dedents */
1148 if (tok->pendin != 0) {
1149 if (tok->pendin < 0) {
1150 tok->pendin++;
1151 return DEDENT;
1152 }
1153 else {
1154 tok->pendin--;
1155 return INDENT;
1156 }
1157 }
1158
1159 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001160 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001161 /* Skip spaces */
1162 do {
1163 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001164 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165
1166 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001167 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001168
Guido van Rossumab5ca152000-03-31 00:52:27 +00001169 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001170 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001171 static char *tabforms[] = {
1172 "tab-width:", /* Emacs */
1173 ":tabstop=", /* vim, full form */
1174 ":ts=", /* vim, abbreviated form */
1175 "set tabsize=", /* will vi never die? */
1176 /* more templates can be added here to support other editors */
1177 };
1178 char cbuf[80];
1179 char *tp, **cp;
1180 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001181 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001182 *tp++ = c = tok_nextc(tok);
1183 } while (c != EOF && c != '\n' &&
1184 tp - cbuf + 1 < sizeof(cbuf));
1185 *tp = '\0';
1186 for (cp = tabforms;
1187 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1188 cp++) {
1189 if ((tp = strstr(cbuf, *cp))) {
1190 int newsize = atoi(tp + strlen(*cp));
1191
1192 if (newsize >= 1 && newsize <= 40) {
1193 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001194 if (Py_VerboseFlag)
1195 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001196 "Tab size set to %d\n",
1197 newsize);
1198 }
1199 }
1200 }
1201 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203 }
1204
1205 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001206 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001208 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209
1210 /* Identifier (most frequent token!) */
1211 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001212 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001213 switch (c) {
1214 case 'r':
1215 case 'R':
1216 c = tok_nextc(tok);
1217 if (c == '"' || c == '\'')
1218 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001219 break;
1220 case 'u':
1221 case 'U':
1222 c = tok_nextc(tok);
1223 if (c == 'r' || c == 'R')
1224 c = tok_nextc(tok);
1225 if (c == '"' || c == '\'')
1226 goto letter_quote;
1227 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001228 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001229 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001230 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001231 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001233 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 *p_end = tok->cur;
1235 return NAME;
1236 }
1237
1238 /* Newline */
1239 if (c == '\n') {
1240 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001241 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001242 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001243 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001245 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001246 return NEWLINE;
1247 }
1248
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001249 /* Period or number starting with period? */
1250 if (c == '.') {
1251 c = tok_nextc(tok);
1252 if (isdigit(c)) {
1253 goto fraction;
1254 }
1255 else {
1256 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001257 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001258 *p_end = tok->cur;
1259 return DOT;
1260 }
1261 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001262
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 /* Number */
1264 if (isdigit(c)) {
1265 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001266 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 c = tok_nextc(tok);
1268 if (c == '.')
1269 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001270#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001271 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001272 goto imaginary;
1273#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001274 if (c == 'x' || c == 'X') {
1275 /* Hex */
1276 do {
1277 c = tok_nextc(tok);
1278 } while (isxdigit(c));
1279 }
1280 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001281 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 /* Octal; c is first char of it */
1283 /* There's no 'isoctdigit' macro, sigh */
1284 while ('0' <= c && c < '8') {
1285 c = tok_nextc(tok);
1286 }
Tim Petersd507dab2001-08-30 20:51:59 +00001287 if (isdigit(c)) {
1288 found_decimal = 1;
1289 do {
1290 c = tok_nextc(tok);
1291 } while (isdigit(c));
1292 }
1293 if (c == '.')
1294 goto fraction;
1295 else if (c == 'e' || c == 'E')
1296 goto exponent;
1297#ifndef WITHOUT_COMPLEX
1298 else if (c == 'j' || c == 'J')
1299 goto imaginary;
1300#endif
1301 else if (found_decimal) {
1302 tok->done = E_TOKEN;
1303 tok_backup(tok, c);
1304 return ERRORTOKEN;
1305 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001307 if (c == 'l' || c == 'L')
1308 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 }
1310 else {
1311 /* Decimal */
1312 do {
1313 c = tok_nextc(tok);
1314 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001315 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001317 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001318 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001319 if (c == '.') {
1320 fraction:
1321 /* Fraction */
1322 do {
1323 c = tok_nextc(tok);
1324 } while (isdigit(c));
1325 }
1326 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001327 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001328 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001330 if (c == '+' || c == '-')
1331 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001332 if (!isdigit(c)) {
1333 tok->done = E_TOKEN;
1334 tok_backup(tok, c);
1335 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001336 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001337 do {
1338 c = tok_nextc(tok);
1339 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001340 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001341#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001342 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001343 /* Imaginary part */
1344 imaginary:
1345 c = tok_nextc(tok);
1346#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 }
1348 }
1349 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001350 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 *p_end = tok->cur;
1352 return NUMBER;
1353 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001354
1355 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001356 /* String */
1357 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +00001358 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001359 int quote = c;
1360 int triple = 0;
1361 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 for (;;) {
1363 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001364 if (c == '\n') {
1365 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001366 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001367 tok_backup(tok, c);
1368 return ERRORTOKEN;
1369 }
1370 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001371 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001372 }
1373 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001374 if (triple)
1375 tok->done = E_EOFS;
1376 else
1377 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001378 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 return ERRORTOKEN;
1380 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001381 else if (c == quote) {
1382 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001383 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 c = tok_nextc(tok);
1385 if (c == quote) {
1386 triple = 1;
1387 tripcount = 0;
1388 continue;
1389 }
1390 tok_backup(tok, c);
1391 }
1392 if (!triple || tripcount == 3)
1393 break;
1394 }
1395 else if (c == '\\') {
1396 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001399 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001400 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 return ERRORTOKEN;
1402 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001404 else
1405 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001406 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001408 *p_end = tok->cur;
1409 return STRING;
1410 }
1411
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001412 /* Line continuation */
1413 if (c == '\\') {
1414 c = tok_nextc(tok);
1415 if (c != '\n') {
1416 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001417 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 return ERRORTOKEN;
1419 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001420 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 goto again; /* Read next line */
1422 }
1423
Guido van Rossumfbab9051991-10-20 20:25:03 +00001424 /* Check for two-character token */
1425 {
1426 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001427 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001428 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001429 int c3 = tok_nextc(tok);
1430 int token3 = PyToken_ThreeChars(c, c2, c3);
1431 if (token3 != OP) {
1432 token = token3;
1433 } else {
1434 tok_backup(tok, c3);
1435 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001436 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001437 *p_end = tok->cur;
1438 return token;
1439 }
1440 tok_backup(tok, c2);
1441 }
1442
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001443 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001444 switch (c) {
1445 case '(':
1446 case '[':
1447 case '{':
1448 tok->level++;
1449 break;
1450 case ')':
1451 case ']':
1452 case '}':
1453 tok->level--;
1454 break;
1455 }
1456
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001457 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001458 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001460 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001461}
1462
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001463int
1464PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1465{
1466 int result = tok_get(tok, p_start, p_end);
1467 if (tok->decoding_erred) {
1468 result = ERRORTOKEN;
1469 tok->done = E_DECODE;
1470 }
1471 return result;
1472}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001473
Guido van Rossum408027e1996-12-30 16:17:54 +00001474#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001475
1476void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001477tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478{
Guido van Rossum86bea461997-04-29 21:03:06 +00001479 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001480 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1481 printf("(%.*s)", (int)(end - start), start);
1482}
1483
1484#endif