blob: 749a59b68c285b49d98484d159128080f3c4cdf5 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Tim Petersdbd9ba62000-07-09 03:09:57 +000021extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042/* Token names */
43
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* Basic token kinds */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison, bitwise and shift operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* Catch-all and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* PGEN build: the input string is used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyMem_DEL(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
172new_string(const char *s, int len)
173{
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the common spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S is one of their recognized spellings (optionally followed by
   "-<suffix>"); otherwise returns S itself, unchanged.  Callers can tell
   the cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value
           (a byte >= 0x80 where char is signed) to tolower()
           is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
204
205/* Return the coding spec in S, or NULL if none is found. */
206
/* Look for a "coding[:=]<spaces><name>" declaration in S.

   The declaration must be in a comment, and that comment must be the
   only statement on the source code line: anything other than blanks,
   tabs and form feeds before the '#' makes the search fail.

   Returns a newly allocated string holding the (normalized) encoding
   name, or NULL if no declaration is found or memory runs out.  The
   caller owns the returned string. */
static char *
get_coding_spec(const char *s, int size)
{
    int i;

    /* Skip leading whitespace up to the start of the comment. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* The cast keeps isalnum() defined for bytes >= 0x80. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;	/* empty name: keep searching */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;	/* out of memory; don't hand NULL
                               to get_normal_name */
        q = get_normal_name(r);
        if (r != q) {
            /* q is a constant canonical name: replace r by
               an owned copy of it. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
255check_coding_spec(const char* line, int size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000279#else
280 /* Without Unicode support, we cannot
281 process the coding spec. Since there
282 won't be any Unicode literals, that
283 won't matter. */
284#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
286 } else { /* then, compare cs with BOM */
287 r = (strcmp(tok->encoding, cs) == 0);
288 PyMem_DEL(cs);
289 }
290 }
291 return r;
292}
293
294/* See whether the file starts with a BOM. If it does,
295 invoke the set_readline function with the new encoding.
296 Return 1 on success, 0 on failure. */
297
298static int
299check_bom(int get_char(struct tok_state *),
300 void unget_char(int, struct tok_state *),
301 int set_readline(struct tok_state *, const char *),
302 struct tok_state *tok)
303{
304 int ch = get_char(tok);
305 tok->decoding_state = 1;
306 if (ch == EOF) {
307 return 1;
308 } else if (ch == 0xEF) {
309 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
310 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
311#if 0
312 /* Disable support for UTF-16 BOMs until a decision
313 is made whether this needs to be supported. */
314 } else if (ch == 0xFE) {
315 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
316 if (!set_readline(tok, "utf-16-be")) return 0;
317 tok->decoding_state = -1;
318 } else if (ch == 0xFF) {
319 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
320 if (!set_readline(tok, "utf-16-le")) return 0;
321 tok->decoding_state = -1;
322#endif
323 } else {
324 unget_char(ch, tok);
325 return 1;
326 }
327 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
328 return 1;
329 NON_BOM:
330 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
331 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
332 return 1;
333}
334
335/* Read a line of text from TOK into S, using the stream in TOK.
336 Return NULL on failure, else S. */
337
/* Read one decoded line from TOK's codec reader into S as UTF-8.

   S has room for SIZE bytes including the terminating NUL.  The line
   comes from tok->decoding_buffer if one is pending, otherwise from
   calling tok->decoding_readline.  When the UTF-8 form of the line does
   not fit, the first SIZE-1 bytes are stored in S and the remaining
   bytes are kept in tok->decoding_buffer as a string object, to be
   returned by the next call (the previous code copied with strcpy()
   and overflowed S in builds where assert() is disabled).

   Returns S, or NULL at EOF (empty line) or on error; errors go
   through error_ret(). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    const char *str;
    size_t len;
    size_t room;

    /* Keep one byte for the terminating NUL. */
    assert(size > 0);
    room = (size_t)size - 1;

    if (buf == NULL) {
        PyObject *args = PyTuple_New(0);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf)) {
            /* Leftover bytes saved by a previous call: they are
               already UTF-8, so take over the reference as-is. */
            utf8 = buf;
        }
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }

    str = PyString_AsString(utf8);
    len = strlen(str);
    if (len > room) {
        /* Too long for S: save the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(str + room, (int)(len - room));
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        len = room;
    }
    memcpy(s, str, len);
    s[len] = '\0';
    Py_DECREF(utf8);
    if (len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
373
374/* Set the readline function for TOK to a StreamReader's
375 readline function. The StreamReader is named ENC.
376
377 This function is called from check_bom and check_coding_spec.
378
379 ENC is usually identical to the future value of tok->encoding,
380 except for the (currently unsupported) case of UTF-16.
381
382 Return 1 on success, 0 on failure. */
383
384static int
385fp_setreadl(struct tok_state *tok, const char* enc)
386{
387 PyObject *reader, *stream, *readline;
388
389 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000390 if (stream == NULL)
391 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000392
393 reader = PyCodec_StreamReader(enc, stream, NULL);
394 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000395 if (reader == NULL)
396 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397
398 readline = PyObject_GetAttrString(reader, "readline");
399 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000400 if (readline == NULL)
401 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000402
403 tok->decoding_readline = readline;
404 return 1;
405}
406
407/* Fetch the next byte from TOK. */
408
409static int fp_getc(struct tok_state *tok) {
410 return getc(tok->fp);
411}
412
413/* Unfetch the last byte back into TOK. */
414
415static void fp_ungetc(int c, struct tok_state *tok) {
416 ungetc(c, tok->fp);
417}
418
419/* Read a line of input from TOK. Determine encoding
420 if necessary. */
421
422static char *
423decoding_fgets(char *s, int size, struct tok_state *tok)
424{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000425 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000426 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000427 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428 if (tok->decoding_state < 0) {
429 /* We already have a codec associated with
430 this input. */
431 line = fp_readl(s, size, tok);
432 break;
433 } else if (tok->decoding_state > 0) {
434 /* We want a 'raw' read. */
435 line = Py_UniversalNewlineFgets(s, size,
436 tok->fp, NULL);
437 warn = 1;
438 break;
439 } else {
440 /* We have not yet determined the encoding.
441 If an encoding is found, use the file-pointer
442 reader functions from now on. */
443 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
444 return error_ret(tok);
445 assert(tok->decoding_state != 0);
446 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000447 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000448 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
449 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
450 return error_ret(tok);
451 }
452 }
453#ifndef PGEN
454 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
455 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000456 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000457 if (*c > 127) {
458 badchar = *c;
459 break;
460 }
461 }
462 if (badchar) {
463 char buf[200];
464 sprintf(buf, "Non-ASCII character '\\x%.2x', "
465 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000466 /* Need to add 1 to the line number, since this line
467 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000469 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470 NULL, NULL);
471 tok->issued_encoding_warning = 1;
472 }
473#endif
474 return line;
475}
476
477static int
478decoding_feof(struct tok_state *tok)
479{
480 if (tok->decoding_state >= 0) {
481 return feof(tok->fp);
482 } else {
483 PyObject* buf = tok->decoding_buffer;
484 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000485 PyObject *args = PyTuple_New(0);
486 if (args == NULL) {
487 error_ret(tok);
488 return 1;
489 }
490 buf = PyObject_Call(tok->decoding_readline,
491 args, NULL);
492 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493 if (buf == NULL) {
494 error_ret(tok);
495 return 1;
496 } else {
497 tok->decoding_buffer = buf;
498 }
499 }
500 return PyObject_Length(buf) == 0;
501 }
502}
503
504/* Fetch a byte from TOK, using the string buffer. */
505
506static int buf_getc(struct tok_state *tok) {
507 return *tok->str++;
508}
509
510/* Unfetch a byte from TOK, using the string buffer. */
511
512static void buf_ungetc(int c, struct tok_state *tok) {
513 tok->str--;
514 assert(*tok->str == c); /* tok->cur may point to read-only segment */
515}
516
517/* Set the readline function for TOK to ENC. For the string-based
518 tokenizer, this means to just record the encoding. */
519
520static int buf_setreadl(struct tok_state *tok, const char* enc) {
521 tok->enc = enc;
522 return 1;
523}
524
525/* Return a UTF-8 encoding Python string object from the
526 C byte string STR, which is encoded with ENC. */
527
Martin v. Löwis019934b2002-08-07 12:33:18 +0000528#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529static PyObject *
530translate_into_utf8(const char* str, const char* enc) {
531 PyObject *utf8;
532 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
533 if (buf == NULL)
534 return NULL;
535 utf8 = PyUnicode_AsUTF8String(buf);
536 Py_DECREF(buf);
537 return utf8;
538}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000539#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540
541/* Decode a byte string STR for use as the buffer of TOK.
542 Look for encoding declarations inside STR, and record them
543 inside TOK. */
544
545static const char *
546decode_str(const char *str, struct tok_state *tok)
547{
548 PyObject* utf8 = NULL;
549 const char *s;
550 int lineno = 0;
551 tok->enc = NULL;
552 tok->str = str;
553 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
554 return NULL;
555 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000556 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000557#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 if (tok->enc != NULL) {
559 utf8 = translate_into_utf8(str, tok->enc);
560 if (utf8 == NULL)
561 return NULL;
562 str = PyString_AsString(utf8);
563 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000564#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 for (s = str;; s++) {
566 if (*s == '\0') break;
567 else if (*s == '\n') {
568 lineno++;
569 if (lineno == 2) break;
570 }
571 }
572 tok->enc = NULL;
573 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
574 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000575#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576 if (tok->enc != NULL) {
577 assert(utf8 == NULL);
578 utf8 = translate_into_utf8(str, tok->enc);
579 if (utf8 == NULL)
580 return NULL;
581 str = PyString_AsString(utf8);
582 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000583#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584 assert(tok->decoding_buffer == NULL);
585 tok->decoding_buffer = utf8; /* CAUTION */
586 return str;
587}
588
589#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000590
591/* Set up tokenizer for string */
592
593struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000594PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000595{
596 struct tok_state *tok = tok_new();
597 if (tok == NULL)
598 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 str = (char *)decode_str(str, tok);
600 if (str == NULL)
601 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000602 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603 return tok;
604}
605
606
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000607/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608
609struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000610PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000611{
612 struct tok_state *tok = tok_new();
613 if (tok == NULL)
614 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000615 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
616 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617 return NULL;
618 }
619 tok->cur = tok->inp = tok->buf;
620 tok->end = tok->buf + BUFSIZ;
621 tok->fp = fp;
622 tok->prompt = ps1;
623 tok->nextprompt = ps2;
624 return tok;
625}
626
627
628/* Free a tok_state structure */
629
630void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000631PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000632{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000633 if (tok->encoding != NULL)
634 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000635#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 Py_XDECREF(tok->decoding_readline);
637 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000638#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000639 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000640 PyMem_DEL(tok->buf);
641 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642}
643
644
645/* Get next char, updating state; error code goes into tok->done */
646
647static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000648tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000651 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000652 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000653 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000654 if (tok->done != E_OK)
655 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000657 char *end = strchr(tok->inp, '\n');
658 if (end != NULL)
659 end++;
660 else {
661 end = strchr(tok->inp, '\0');
662 if (end == tok->inp) {
663 tok->done = E_EOF;
664 return EOF;
665 }
666 }
667 if (tok->start == NULL)
668 tok->buf = tok->cur;
669 tok->lineno++;
670 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000671 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000674 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675 if (tok->nextprompt != NULL)
676 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000677 if (new == NULL)
678 tok->done = E_INTR;
679 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000680 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681 tok->done = E_EOF;
682 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000683 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000684 size_t start = tok->start - tok->buf;
685 size_t oldlen = tok->cur - tok->buf;
686 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000687 char *buf = tok->buf;
688 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000689 tok->lineno++;
690 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000691 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000692 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000693 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000694 tok->done = E_NOMEM;
695 return EOF;
696 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000697 tok->buf = buf;
698 tok->cur = tok->buf + oldlen;
699 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000700 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000701 tok->inp = tok->buf + newlen;
702 tok->end = tok->inp + 1;
703 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000704 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000705 else {
706 tok->lineno++;
707 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000708 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000709 tok->buf = new;
710 tok->cur = tok->buf;
711 tok->inp = strchr(tok->buf, '\0');
712 tok->end = tok->inp + 1;
713 }
714 }
715 else {
716 int done = 0;
717 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000718 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000719 if (tok->start == NULL) {
720 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000721 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000722 if (tok->buf == NULL) {
723 tok->done = E_NOMEM;
724 return EOF;
725 }
726 tok->end = tok->buf + BUFSIZ;
727 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000728 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
729 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000730 tok->done = E_EOF;
731 done = 1;
732 }
733 else {
734 tok->done = E_OK;
735 tok->inp = strchr(tok->buf, '\0');
736 done = tok->inp[-1] == '\n';
737 }
738 }
739 else {
740 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000741 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000742 tok->done = E_EOF;
743 done = 1;
744 }
745 else
746 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000747 }
748 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000749 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000750 while (!done) {
751 int curstart = tok->start == NULL ? -1 :
752 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000753 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000754 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000755 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000756 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000757 if (newbuf == NULL) {
758 tok->done = E_NOMEM;
759 tok->cur = tok->inp;
760 return EOF;
761 }
762 tok->buf = newbuf;
763 tok->inp = tok->buf + curvalid;
764 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000765 tok->start = curstart < 0 ? NULL :
766 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000767 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000768 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000769 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000770 /* Last line does not end in \n,
771 fake one */
772 strcpy(tok->inp, "\n");
773 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000774 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000775 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000776 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000777 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000778#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000779 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000780 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000781 pt = tok->inp - 2;
782 if (pt >= tok->buf && *pt == '\r') {
783 *pt++ = '\n';
784 *pt = '\0';
785 tok->inp = pt;
786 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000787#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788 }
789 if (tok->done != E_OK) {
790 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000791 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000792 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793 return EOF;
794 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000795 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000796 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797}
798
799
800/* Back-up one character */
801
802static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000803tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000804{
805 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000806 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000807 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808 if (*tok->cur != c)
809 *tok->cur = c;
810 }
811}
812
813
814/* Return the token corresponding to a single character */
815
816int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000817PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818{
819 switch (c) {
820 case '(': return LPAR;
821 case ')': return RPAR;
822 case '[': return LSQB;
823 case ']': return RSQB;
824 case ':': return COLON;
825 case ',': return COMMA;
826 case ';': return SEMI;
827 case '+': return PLUS;
828 case '-': return MINUS;
829 case '*': return STAR;
830 case '/': return SLASH;
831 case '|': return VBAR;
832 case '&': return AMPER;
833 case '<': return LESS;
834 case '>': return GREATER;
835 case '=': return EQUAL;
836 case '.': return DOT;
837 case '%': return PERCENT;
838 case '`': return BACKQUOTE;
839 case '{': return LBRACE;
840 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000841 case '^': return CIRCUMFLEX;
842 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843 default: return OP;
844 }
845}
846
847
Guido van Rossumfbab9051991-10-20 20:25:03 +0000848int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000849PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000850{
851 switch (c1) {
852 case '=':
853 switch (c2) {
854 case '=': return EQEQUAL;
855 }
856 break;
857 case '!':
858 switch (c2) {
859 case '=': return NOTEQUAL;
860 }
861 break;
862 case '<':
863 switch (c2) {
864 case '>': return NOTEQUAL;
865 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000866 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000867 }
868 break;
869 case '>':
870 switch (c2) {
871 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000872 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000873 }
874 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000875 case '+':
876 switch (c2) {
877 case '=': return PLUSEQUAL;
878 }
879 break;
880 case '-':
881 switch (c2) {
882 case '=': return MINEQUAL;
883 }
884 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000885 case '*':
886 switch (c2) {
887 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000888 case '=': return STAREQUAL;
889 }
890 break;
891 case '/':
892 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000893 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000894 case '=': return SLASHEQUAL;
895 }
896 break;
897 case '|':
898 switch (c2) {
899 case '=': return VBAREQUAL;
900 }
901 break;
902 case '%':
903 switch (c2) {
904 case '=': return PERCENTEQUAL;
905 }
906 break;
907 case '&':
908 switch (c2) {
909 case '=': return AMPEREQUAL;
910 }
911 break;
912 case '^':
913 switch (c2) {
914 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000915 }
916 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000917 }
918 return OP;
919}
920
Thomas Wouters434d0822000-08-24 20:11:32 +0000921int
922PyToken_ThreeChars(int c1, int c2, int c3)
923{
924 switch (c1) {
925 case '<':
926 switch (c2) {
927 case '<':
928 switch (c3) {
929 case '=':
930 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000931 }
932 break;
933 }
934 break;
935 case '>':
936 switch (c2) {
937 case '>':
938 switch (c3) {
939 case '=':
940 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000941 }
942 break;
943 }
944 break;
945 case '*':
946 switch (c2) {
947 case '*':
948 switch (c3) {
949 case '=':
950 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000951 }
952 break;
953 }
954 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000955 case '/':
956 switch (c2) {
957 case '/':
958 switch (c3) {
959 case '=':
960 return DOUBLESLASHEQUAL;
961 }
962 break;
963 }
964 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000965 }
966 return OP;
967}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000968
Guido van Rossum926f13a1998-04-09 21:38:06 +0000969static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000970indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000971{
972 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000973 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000974 tok->cur = tok->inp;
975 return 1;
976 }
977 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000978 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
979 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000980 tok->altwarning = 0;
981 }
982 return 0;
983}
984
985
/* Get next token, after space stripping etc.

   Reads characters through tok_nextc() and returns the type of the next
   token.  *p_start and *p_end are first set to NULL; for NAME, NEWLINE,
   DOT, NUMBER, STRING and operator/punctuation tokens they are then set
   to the start and one-past-the-end of the token text (for NEWLINE the
   '\n' itself is excluded).  INDENT, DEDENT, ENDMARKER and ERRORTOKEN
   are returned with both left as NULL.

   On a lexical error ERRORTOKEN is returned and tok->done holds the
   error code (E_TOODEEP, E_DEDENT, E_TOKEN, E_EOLS, E_EOFS, or
   E_TABSPACE via indenterror()).  When tok_nextc() returns EOF the
   result is ENDMARKER if tok->done == E_EOF and ERRORTOKEN otherwise.

   State updated here: tok->atbol, tok->indent, tok->indstack,
   tok->altindstack, tok->pendin, tok->level, tok->tabsize,
   tok->cont_line, tok->start, tok->cur, tok->done. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
    /* Re-entered from the newline handling below when the line just
       finished was blank/comment-only or ended inside brackets. */
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with tok->tabsize, altcol with
           tok->alttabsize; the two are compared against their own
           stacks so that indentation which is only consistent under
           one tab size can be reported via indenterror(). */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                /* Advance to the next tab stop */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        /* Un-read the first non-indentation character */
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only tracked outside brackets
           (tok->level == 0) and on lines that carry code. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                /* The new column must match an enclosing level */
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* tok->pendin > 0 counts INDENT tokens still owed, < 0 counts
       DEDENT tokens; one is handed out per call. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Re-entered after a backslash-newline line continuation */
 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",       /* Emacs */
            ":tabstop=",        /* vim, full form */
            ":ts=",             /* vim, abbreviated form */
            "set tabsize=",     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy the comment text into cbuf, stopping at EOF, newline,
           or when only room for the terminating '\0' is left.  The
           terminating character read is stored as well. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
             tp - cbuf + 1 < sizeof(cbuf));
        *tp = '\0';
        /* Look for each editor template; the number following a
           match becomes the new tab size if it is in 1..40. */
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        /* Skip whatever part of the comment did not fit in cbuf */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        /* If a quote follows the prefix letter(s), the token is a
           string: jump to the string scanner with tok->start still
           pointing at the prefix.  Otherwise c already holds the
           next character and scanning continues as a name. */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* No NEWLINE token for ignorable lines or inside brackets */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

#ifdef macintosh
    if (c == '\r') {
        PySys_WriteStderr(
          "File contains \\r characters (incorrect line endings?)\n");
        tok->done = E_TOKEN;
        tok->cur = tok->inp;
        return ERRORTOKEN;
    }
#endif
    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            /* Jumps into the number scanner below */
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex or octal -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else {
                /* Set when a digit 8 or 9 follows the leading
                   zero; that is accepted only if the literal goes
                   on to be a float or imaginary number. */
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long-integer suffix */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                /* The labels below are also jump targets for the
                   '.' and leading-zero cases above. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    /* At least one exponent digit is required */
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the value tok->cur - tok->start will have right
           after reading the character that immediately follows the
           opening quote; it is used to recognize a second quote
           directly after the first. */
        int quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;         /* nonzero inside a triple-quoted string */
        int tripcount = 0;      /* consecutive quote characters seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                /* A newline is only allowed in triple-quoted strings */
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote right after the opening one:
                       a third makes this a triple-quoted string,
                       otherwise it is the empty string. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                /* A single quote ends a plain string; three in a
                   row end a triple-quoted one. */
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Consume the escaped character so that an escaped
                   quote or newline is not interpreted above. */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        /* A backslash must be immediately followed by a newline */
        if (c != '\n') {
            tok->done = E_TOKEN;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* Try to extend the match to a three-character token */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1401
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001402int
1403PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1404{
1405 int result = tok_get(tok, p_start, p_end);
1406 if (tok->decoding_erred) {
1407 result = ERRORTOKEN;
1408 tok->done = E_DECODE;
1409 }
1410 return result;
1411}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001412
Guido van Rossum408027e1996-12-30 16:17:54 +00001413#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414
1415void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001416tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001417{
Guido van Rossum86bea461997-04-29 21:03:06 +00001418 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1420 printf("(%.*s)", (int)(end - start), start);
1421}
1422
1423#endif