blob: 4119c43d5d3eeb05431341036cf7108df107fdda [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Tim Petersdbd9ba62000-07-09 03:09:57 +000021extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042/* Token names */
43
/* Printable name of every token type, in token-number order.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise operators, shifts and power */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* Generic operator and the two sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000131#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->decoding_readline = NULL;
133 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135 return tok;
136}
137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138#ifdef PGEN
139
140static char *
141decoding_fgets(char *s, int size, struct tok_state *tok)
142{
143 return fgets(s, size, tok->fp);
144}
145
146static int
147decoding_feof(struct tok_state *tok)
148{
149 return feof(tok->fp);
150}
151
/* PGEN build: strings are used as they are; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
157
158#else /* PGEN */
159
160static char *
161error_ret(struct tok_state *tok) /* XXX */
162{
163 tok->decoding_erred = 1;
164 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
165 PyMem_DEL(tok->buf);
166 tok->buf = NULL;
167 return NULL; /* as if it were EOF */
168}
169
170static char *
171new_string(const char *s, int len)
172{
173 char* result = PyMem_NEW(char, len + 1);
174 if (result != NULL) {
175 memcpy(result, s, len);
176 result[len] = '\0';
177 }
178 return result;
179}
180
/* Map the spellings of the two built-in encodings onto one canonical
   name each.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced by '-' before comparing.  Returns the string
   literal "utf-8" or "iso-8859-1" when S names (or starts with the
   name of, followed by '-') one of those encodings; otherwise returns
   S itself, so callers can detect "no change" by pointer comparison. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative
           value (any non-ASCII byte where plain char is signed)
           is undefined behaviour. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
203
204/* Return the coding spec in S, or NULL if none is found. */
205
/* Search the first SIZE bytes of S for "coding" followed by ':' or '='
   and then an encoding name made of alphanumerics, '-', '_' and '.'.

   Returns a newly allocated string holding the name (normalised
   through get_normal_name), which the caller must release with
   PyMem_DEL.  Returns NULL when no spec is found or memory runs out. */
static char *
get_coding_spec(const char *s, int size)
{
    int i;

    for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;

        /* Skip the separator and any blanks that follow it */
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        /* Collect the name.  The cast keeps isalnum() defined for
           bytes >= 0x80 on platforms where char is signed. */
        begin = t;
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;
        if (begin == t)
            continue;	/* empty name: keep searching */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;	/* out of memory; never pass NULL on */
        q = get_normal_name(r);
        if (r != q) {
            /* A canonical literal came back; hand out a copy of it
               so the caller can always free the result. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
239
240/* Check whether the line contains a coding spec. If it does,
241 invoke the set_readline function for the new encoding.
242 This function receives the tok_state and the new encoding.
243 Return 1 on success, 0 on failure. */
244
245static int
246check_coding_spec(const char* line, int size, struct tok_state *tok,
247 int set_readline(struct tok_state *, const char *))
248{
249 int r = 1;
250 char* cs = get_coding_spec(line, size);
251 if (cs != NULL) {
252 tok->read_coding_spec = 1;
253 if (tok->encoding == NULL) {
254 assert(tok->decoding_state == 1); /* raw */
255 if (strcmp(cs, "utf-8") == 0 ||
256 strcmp(cs, "iso-8859-1") == 0) {
257 tok->encoding = cs;
258 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000259#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 r = set_readline(tok, cs);
261 if (r) {
262 tok->encoding = cs;
263 tok->decoding_state = -1;
264 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000265#else
266 /* Without Unicode support, we cannot
267 process the coding spec. Since there
268 won't be any Unicode literals, that
269 won't matter. */
270#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 }
272 } else { /* then, compare cs with BOM */
273 r = (strcmp(tok->encoding, cs) == 0);
274 PyMem_DEL(cs);
275 }
276 }
277 return r;
278}
279
280/* See whether the file starts with a BOM. If it does,
281 invoke the set_readline function with the new encoding.
282 Return 1 on success, 0 on failure. */
283
284static int
285check_bom(int get_char(struct tok_state *),
286 void unget_char(int, struct tok_state *),
287 int set_readline(struct tok_state *, const char *),
288 struct tok_state *tok)
289{
290 int ch = get_char(tok);
291 tok->decoding_state = 1;
292 if (ch == EOF) {
293 return 1;
294 } else if (ch == 0xEF) {
295 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
296 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
297#if 0
298 /* Disable support for UTF-16 BOMs until a decision
299 is made whether this needs to be supported. */
300 } else if (ch == 0xFE) {
301 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
302 if (!set_readline(tok, "utf-16-be")) return 0;
303 tok->decoding_state = -1;
304 } else if (ch == 0xFF) {
305 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
306 if (!set_readline(tok, "utf-16-le")) return 0;
307 tok->decoding_state = -1;
308#endif
309 } else {
310 unget_char(ch, tok);
311 return 1;
312 }
313 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
314 return 1;
315 NON_BOM:
316 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
317 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
318 return 1;
319}
320
321/* Read a line of text from TOK into S, using the stream in TOK.
322 Return NULL on failure, else S. */
323
/* Fetch one decoded line for TOK and store it, UTF-8 encoded and
   NUL-terminated, in S (capacity SIZE bytes).

   The line comes from tok->decoding_buffer when one is pending,
   otherwise from calling tok->decoding_readline with no arguments.
   If the UTF-8 form does not fit into S, as much as fits is returned
   (never splitting a multi-byte sequence) and the remainder is put
   back into tok->decoding_buffer for the next call.

   Returns S on success.  Returns NULL on EOF (empty line) and, via
   error_ret, on any failure. */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8;
    PyObject *buf = tok->decoding_buffer;
    const char *str;
    size_t len;
    size_t n;

    if (buf == NULL) {
        PyObject *args = PyTuple_New(0);
        if (args == NULL)
            return error_ret(tok);
        buf = PyObject_Call(tok->decoding_readline, args, NULL);
        Py_DECREF(args);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* We now own the pending line */
        tok->decoding_buffer = NULL;
    }
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    if (utf8 == NULL)
        return error_ret(tok);

    str = PyString_AsString(utf8);
    if (str == NULL) {
        Py_DECREF(utf8);
        return error_ret(tok);
    }

    len = strlen(str);
    n = len;
    if (size <= 0 || len >= (size_t)size) {
        /* The line does not fit.  Previously this was only guarded
           by an assert, so release builds overran S. */
        PyObject *rest;

        n = (size > 0) ? (size_t)size - 1 : 0;
        /* Back up to the start of a UTF-8 sequence */
        while (n > 0 && ((unsigned char)str[n] & 0xC0) == 0x80)
            n--;
        if (n == 0) {
            /* No room for even one character */
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        rest = PyUnicode_DecodeUTF8(str + n, (int)(len - n), NULL);
        if (rest == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        tok->decoding_buffer = rest;
    }
    memcpy(s, str, n);
    s[n] = '\0';
    Py_DECREF(utf8);
    if (s[0] == '\0')
        return NULL; /* EOF */
    return s;
#endif
}
359
360/* Set the readline function for TOK to a StreamReader's
361 readline function. The StreamReader is named ENC.
362
363 This function is called from check_bom and check_coding_spec.
364
365 ENC is usually identical to the future value of tok->encoding,
366 except for the (currently unsupported) case of UTF-16.
367
368 Return 1 on success, 0 on failure. */
369
370static int
371fp_setreadl(struct tok_state *tok, const char* enc)
372{
373 PyObject *reader, *stream, *readline;
374
375 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000376 if (stream == NULL)
377 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000378
379 reader = PyCodec_StreamReader(enc, stream, NULL);
380 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000381 if (reader == NULL)
382 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000383
384 readline = PyObject_GetAttrString(reader, "readline");
385 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000386 if (readline == NULL)
387 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000388
389 tok->decoding_readline = readline;
390 return 1;
391}
392
393/* Fetch the next byte from TOK. */
394
395static int fp_getc(struct tok_state *tok) {
396 return getc(tok->fp);
397}
398
399/* Unfetch the last byte back into TOK. */
400
401static void fp_ungetc(int c, struct tok_state *tok) {
402 ungetc(c, tok->fp);
403}
404
405/* Read a line of input from TOK. Determine encoding
406 if necessary. */
407
408static char *
409decoding_fgets(char *s, int size, struct tok_state *tok)
410{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000411 char *line = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000412 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000413 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414 if (tok->decoding_state < 0) {
415 /* We already have a codec associated with
416 this input. */
417 line = fp_readl(s, size, tok);
418 break;
419 } else if (tok->decoding_state > 0) {
420 /* We want a 'raw' read. */
421 line = Py_UniversalNewlineFgets(s, size,
422 tok->fp, NULL);
423 warn = 1;
424 break;
425 } else {
426 /* We have not yet determined the encoding.
427 If an encoding is found, use the file-pointer
428 reader functions from now on. */
429 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
430 return error_ret(tok);
431 assert(tok->decoding_state != 0);
432 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000433 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000434 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
435 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
436 return error_ret(tok);
437 }
438 }
439#ifndef PGEN
440 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
441 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000442 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443 if (*c > 127) {
444 badchar = *c;
445 break;
446 }
447 }
448 if (badchar) {
449 char buf[200];
450 sprintf(buf, "Non-ASCII character '\\x%.2x', "
451 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000452 /* Need to add 1 to the line number, since this line
453 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000455 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000456 NULL, NULL);
457 tok->issued_encoding_warning = 1;
458 }
459#endif
460 return line;
461}
462
463static int
464decoding_feof(struct tok_state *tok)
465{
466 if (tok->decoding_state >= 0) {
467 return feof(tok->fp);
468 } else {
469 PyObject* buf = tok->decoding_buffer;
470 if (buf == NULL) {
Guido van Rossum84b2bed2002-08-16 17:01:09 +0000471 PyObject *args = PyTuple_New(0);
472 if (args == NULL) {
473 error_ret(tok);
474 return 1;
475 }
476 buf = PyObject_Call(tok->decoding_readline,
477 args, NULL);
478 Py_DECREF(args);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 if (buf == NULL) {
480 error_ret(tok);
481 return 1;
482 } else {
483 tok->decoding_buffer = buf;
484 }
485 }
486 return PyObject_Length(buf) == 0;
487 }
488}
489
490/* Fetch a byte from TOK, using the string buffer. */
491
492static int buf_getc(struct tok_state *tok) {
493 return *tok->str++;
494}
495
496/* Unfetch a byte from TOK, using the string buffer. */
497
498static void buf_ungetc(int c, struct tok_state *tok) {
499 tok->str--;
500 assert(*tok->str == c); /* tok->cur may point to read-only segment */
501}
502
503/* Set the readline function for TOK to ENC. For the string-based
504 tokenizer, this means to just record the encoding. */
505
506static int buf_setreadl(struct tok_state *tok, const char* enc) {
507 tok->enc = enc;
508 return 1;
509}
510
511/* Return a UTF-8 encoding Python string object from the
512 C byte string STR, which is encoded with ENC. */
513
Martin v. Löwis019934b2002-08-07 12:33:18 +0000514#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515static PyObject *
516translate_into_utf8(const char* str, const char* enc) {
517 PyObject *utf8;
518 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
519 if (buf == NULL)
520 return NULL;
521 utf8 = PyUnicode_AsUTF8String(buf);
522 Py_DECREF(buf);
523 return utf8;
524}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000525#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526
527/* Decode a byte string STR for use as the buffer of TOK.
528 Look for encoding declarations inside STR, and record them
529 inside TOK. */
530
531static const char *
532decode_str(const char *str, struct tok_state *tok)
533{
534 PyObject* utf8 = NULL;
535 const char *s;
536 int lineno = 0;
537 tok->enc = NULL;
538 tok->str = str;
539 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
540 return NULL;
541 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000542 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000543#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544 if (tok->enc != NULL) {
545 utf8 = translate_into_utf8(str, tok->enc);
546 if (utf8 == NULL)
547 return NULL;
548 str = PyString_AsString(utf8);
549 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000550#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 for (s = str;; s++) {
552 if (*s == '\0') break;
553 else if (*s == '\n') {
554 lineno++;
555 if (lineno == 2) break;
556 }
557 }
558 tok->enc = NULL;
559 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
560 return NULL;
Martin v. Löwis019934b2002-08-07 12:33:18 +0000561#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562 if (tok->enc != NULL) {
563 assert(utf8 == NULL);
564 utf8 = translate_into_utf8(str, tok->enc);
565 if (utf8 == NULL)
566 return NULL;
567 str = PyString_AsString(utf8);
568 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000569#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000570 assert(tok->decoding_buffer == NULL);
571 tok->decoding_buffer = utf8; /* CAUTION */
572 return str;
573}
574
575#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000576
577/* Set up tokenizer for string */
578
579struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000580PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000581{
582 struct tok_state *tok = tok_new();
583 if (tok == NULL)
584 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 str = (char *)decode_str(str, tok);
586 if (str == NULL)
587 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000588 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000589 return tok;
590}
591
592
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000593/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000594
595struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000596PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000597{
598 struct tok_state *tok = tok_new();
599 if (tok == NULL)
600 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000601 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
602 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603 return NULL;
604 }
605 tok->cur = tok->inp = tok->buf;
606 tok->end = tok->buf + BUFSIZ;
607 tok->fp = fp;
608 tok->prompt = ps1;
609 tok->nextprompt = ps2;
610 return tok;
611}
612
613
614/* Free a tok_state structure */
615
616void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000617PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000618{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 if (tok->encoding != NULL)
620 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000621#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 Py_XDECREF(tok->decoding_readline);
623 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000624#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000626 PyMem_DEL(tok->buf);
627 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000628}
629
630
631/* Get next char, updating state; error code goes into tok->done */
632
633static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000634tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000635{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000637 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000638 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000639 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000640 if (tok->done != E_OK)
641 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000642 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000643 char *end = strchr(tok->inp, '\n');
644 if (end != NULL)
645 end++;
646 else {
647 end = strchr(tok->inp, '\0');
648 if (end == tok->inp) {
649 tok->done = E_EOF;
650 return EOF;
651 }
652 }
653 if (tok->start == NULL)
654 tok->buf = tok->cur;
655 tok->lineno++;
656 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000657 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000660 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661 if (tok->nextprompt != NULL)
662 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000663 if (new == NULL)
664 tok->done = E_INTR;
665 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000666 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 tok->done = E_EOF;
668 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000669 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000670 size_t start = tok->start - tok->buf;
671 size_t oldlen = tok->cur - tok->buf;
672 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000673 char *buf = tok->buf;
674 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000675 tok->lineno++;
676 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000677 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000678 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000679 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000680 tok->done = E_NOMEM;
681 return EOF;
682 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000683 tok->buf = buf;
684 tok->cur = tok->buf + oldlen;
685 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000686 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000687 tok->inp = tok->buf + newlen;
688 tok->end = tok->inp + 1;
689 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000690 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000691 else {
692 tok->lineno++;
693 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000694 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000695 tok->buf = new;
696 tok->cur = tok->buf;
697 tok->inp = strchr(tok->buf, '\0');
698 tok->end = tok->inp + 1;
699 }
700 }
701 else {
702 int done = 0;
703 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000704 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000705 if (tok->start == NULL) {
706 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000707 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000708 if (tok->buf == NULL) {
709 tok->done = E_NOMEM;
710 return EOF;
711 }
712 tok->end = tok->buf + BUFSIZ;
713 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000714 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
715 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000716 tok->done = E_EOF;
717 done = 1;
718 }
719 else {
720 tok->done = E_OK;
721 tok->inp = strchr(tok->buf, '\0');
722 done = tok->inp[-1] == '\n';
723 }
724 }
725 else {
726 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000727 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000728 tok->done = E_EOF;
729 done = 1;
730 }
731 else
732 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000733 }
734 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000735 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000736 while (!done) {
737 int curstart = tok->start == NULL ? -1 :
738 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000739 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000740 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000741 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000742 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000743 if (newbuf == NULL) {
744 tok->done = E_NOMEM;
745 tok->cur = tok->inp;
746 return EOF;
747 }
748 tok->buf = newbuf;
749 tok->inp = tok->buf + curvalid;
750 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000751 tok->start = curstart < 0 ? NULL :
752 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000753 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000754 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000755 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000756 /* Last line does not end in \n,
757 fake one */
758 strcpy(tok->inp, "\n");
759 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000760 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000761 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000762 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000763 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000764#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000765 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000766 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000767 pt = tok->inp - 2;
768 if (pt >= tok->buf && *pt == '\r') {
769 *pt++ = '\n';
770 *pt = '\0';
771 tok->inp = pt;
772 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000773#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 }
775 if (tok->done != E_OK) {
776 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000777 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000778 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 return EOF;
780 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000782 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783}
784
785
786/* Back-up one character */
787
788static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000789tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000790{
791 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000792 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000793 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000794 if (*tok->cur != c)
795 *tok->cur = c;
796 }
797}
798
799
800/* Return the token corresponding to a single character */
801
802int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000803PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000804{
805 switch (c) {
806 case '(': return LPAR;
807 case ')': return RPAR;
808 case '[': return LSQB;
809 case ']': return RSQB;
810 case ':': return COLON;
811 case ',': return COMMA;
812 case ';': return SEMI;
813 case '+': return PLUS;
814 case '-': return MINUS;
815 case '*': return STAR;
816 case '/': return SLASH;
817 case '|': return VBAR;
818 case '&': return AMPER;
819 case '<': return LESS;
820 case '>': return GREATER;
821 case '=': return EQUAL;
822 case '.': return DOT;
823 case '%': return PERCENT;
824 case '`': return BACKQUOTE;
825 case '{': return LBRACE;
826 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000827 case '^': return CIRCUMFLEX;
828 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829 default: return OP;
830 }
831}
832
833
Guido van Rossumfbab9051991-10-20 20:25:03 +0000834int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000835PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000836{
837 switch (c1) {
838 case '=':
839 switch (c2) {
840 case '=': return EQEQUAL;
841 }
842 break;
843 case '!':
844 switch (c2) {
845 case '=': return NOTEQUAL;
846 }
847 break;
848 case '<':
849 switch (c2) {
850 case '>': return NOTEQUAL;
851 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000852 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000853 }
854 break;
855 case '>':
856 switch (c2) {
857 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000858 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000859 }
860 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000861 case '+':
862 switch (c2) {
863 case '=': return PLUSEQUAL;
864 }
865 break;
866 case '-':
867 switch (c2) {
868 case '=': return MINEQUAL;
869 }
870 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000871 case '*':
872 switch (c2) {
873 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000874 case '=': return STAREQUAL;
875 }
876 break;
877 case '/':
878 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000879 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000880 case '=': return SLASHEQUAL;
881 }
882 break;
883 case '|':
884 switch (c2) {
885 case '=': return VBAREQUAL;
886 }
887 break;
888 case '%':
889 switch (c2) {
890 case '=': return PERCENTEQUAL;
891 }
892 break;
893 case '&':
894 switch (c2) {
895 case '=': return AMPEREQUAL;
896 }
897 break;
898 case '^':
899 switch (c2) {
900 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000901 }
902 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000903 }
904 return OP;
905}
906
Thomas Wouters434d0822000-08-24 20:11:32 +0000907int
908PyToken_ThreeChars(int c1, int c2, int c3)
909{
910 switch (c1) {
911 case '<':
912 switch (c2) {
913 case '<':
914 switch (c3) {
915 case '=':
916 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000917 }
918 break;
919 }
920 break;
921 case '>':
922 switch (c2) {
923 case '>':
924 switch (c3) {
925 case '=':
926 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000927 }
928 break;
929 }
930 break;
931 case '*':
932 switch (c2) {
933 case '*':
934 switch (c3) {
935 case '=':
936 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000937 }
938 break;
939 }
940 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000941 case '/':
942 switch (c2) {
943 case '/':
944 switch (c3) {
945 case '=':
946 return DOUBLESLASHEQUAL;
947 }
948 break;
949 }
950 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000951 }
952 return OP;
953}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000954
Guido van Rossum926f13a1998-04-09 21:38:06 +0000955static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000956indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000957{
958 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000959 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000960 tok->cur = tok->inp;
961 return 1;
962 }
963 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000964 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
965 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000966 tok->altwarning = 0;
967 }
968 return 0;
969}
970
971
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000972/* Get next token, after space stripping etc. */
973
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000974static int
975tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000976{
977 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000978 int blankline;
979
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000980 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000981 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000982 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000983 blankline = 0;
984
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000985 /* Get indentation level */
986 if (tok->atbol) {
987 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000988 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000989 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000990 for (;;) {
991 c = tok_nextc(tok);
992 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000993 col++, altcol++;
994 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000996 altcol = (altcol/tok->alttabsize + 1)
997 * tok->alttabsize;
998 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000999 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001000 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001001 else
1002 break;
1003 }
1004 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001005 if (c == '#' || c == '\n') {
1006 /* Lines with only whitespace and/or comments
1007 shouldn't affect the indentation and are
1008 not passed to the parser as NEWLINE tokens,
1009 except *totally* empty lines in interactive
1010 mode, which signal the end of a command group. */
1011 if (col == 0 && c == '\n' && tok->prompt != NULL)
1012 blankline = 0; /* Let it through */
1013 else
1014 blankline = 1; /* Ignore completely */
1015 /* We can't jump back right here since we still
1016 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001017 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001018 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001019 if (col == tok->indstack[tok->indent]) {
1020 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001021 if (altcol != tok->altindstack[tok->indent]) {
1022 if (indenterror(tok))
1023 return ERRORTOKEN;
1024 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001026 else if (col > tok->indstack[tok->indent]) {
1027 /* Indent -- always one */
1028 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001029 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001030 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001031 return ERRORTOKEN;
1032 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001033 if (altcol <= tok->altindstack[tok->indent]) {
1034 if (indenterror(tok))
1035 return ERRORTOKEN;
1036 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001037 tok->pendin++;
1038 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001039 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001040 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001041 else /* col < tok->indstack[tok->indent] */ {
1042 /* Dedent -- any number, must be consistent */
1043 while (tok->indent > 0 &&
1044 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001045 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001046 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001047 }
1048 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001049 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001050 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001051 return ERRORTOKEN;
1052 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001053 if (altcol != tok->altindstack[tok->indent]) {
1054 if (indenterror(tok))
1055 return ERRORTOKEN;
1056 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057 }
1058 }
1059 }
1060
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001061 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001062
1063 /* Return pending indents/dedents */
1064 if (tok->pendin != 0) {
1065 if (tok->pendin < 0) {
1066 tok->pendin++;
1067 return DEDENT;
1068 }
1069 else {
1070 tok->pendin--;
1071 return INDENT;
1072 }
1073 }
1074
1075 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001076 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001077 /* Skip spaces */
1078 do {
1079 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001080 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001081
1082 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001083 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001084
Guido van Rossumab5ca152000-03-31 00:52:27 +00001085 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001087 static char *tabforms[] = {
1088 "tab-width:", /* Emacs */
1089 ":tabstop=", /* vim, full form */
1090 ":ts=", /* vim, abbreviated form */
1091 "set tabsize=", /* will vi never die? */
1092 /* more templates can be added here to support other editors */
1093 };
1094 char cbuf[80];
1095 char *tp, **cp;
1096 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001098 *tp++ = c = tok_nextc(tok);
1099 } while (c != EOF && c != '\n' &&
1100 tp - cbuf + 1 < sizeof(cbuf));
1101 *tp = '\0';
1102 for (cp = tabforms;
1103 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1104 cp++) {
1105 if ((tp = strstr(cbuf, *cp))) {
1106 int newsize = atoi(tp + strlen(*cp));
1107
1108 if (newsize >= 1 && newsize <= 40) {
1109 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001110 if (Py_VerboseFlag)
1111 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001112 "Tab size set to %d\n",
1113 newsize);
1114 }
1115 }
1116 }
1117 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119 }
1120
1121 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001122 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001123 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001124 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001125
1126 /* Identifier (most frequent token!) */
1127 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001128 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001129 switch (c) {
1130 case 'r':
1131 case 'R':
1132 c = tok_nextc(tok);
1133 if (c == '"' || c == '\'')
1134 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001135 break;
1136 case 'u':
1137 case 'U':
1138 c = tok_nextc(tok);
1139 if (c == 'r' || c == 'R')
1140 c = tok_nextc(tok);
1141 if (c == '"' || c == '\'')
1142 goto letter_quote;
1143 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001144 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001145 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001147 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001148 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001149 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001150 *p_end = tok->cur;
1151 return NAME;
1152 }
1153
1154 /* Newline */
1155 if (c == '\n') {
1156 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001157 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001159 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001160 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1161 return NEWLINE;
1162 }
1163
Guido van Rossum2d45be11997-04-11 19:16:25 +00001164#ifdef macintosh
1165 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001166 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001167 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001168 tok->done = E_TOKEN;
1169 tok->cur = tok->inp;
1170 return ERRORTOKEN;
1171 }
1172#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001173 /* Period or number starting with period? */
1174 if (c == '.') {
1175 c = tok_nextc(tok);
1176 if (isdigit(c)) {
1177 goto fraction;
1178 }
1179 else {
1180 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001181 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001182 *p_end = tok->cur;
1183 return DOT;
1184 }
1185 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001186
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187 /* Number */
1188 if (isdigit(c)) {
1189 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001190 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191 c = tok_nextc(tok);
1192 if (c == '.')
1193 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001194#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001195 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001196 goto imaginary;
1197#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 if (c == 'x' || c == 'X') {
1199 /* Hex */
1200 do {
1201 c = tok_nextc(tok);
1202 } while (isxdigit(c));
1203 }
1204 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001205 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001206 /* Octal; c is first char of it */
1207 /* There's no 'isoctdigit' macro, sigh */
1208 while ('0' <= c && c < '8') {
1209 c = tok_nextc(tok);
1210 }
Tim Petersd507dab2001-08-30 20:51:59 +00001211 if (isdigit(c)) {
1212 found_decimal = 1;
1213 do {
1214 c = tok_nextc(tok);
1215 } while (isdigit(c));
1216 }
1217 if (c == '.')
1218 goto fraction;
1219 else if (c == 'e' || c == 'E')
1220 goto exponent;
1221#ifndef WITHOUT_COMPLEX
1222 else if (c == 'j' || c == 'J')
1223 goto imaginary;
1224#endif
1225 else if (found_decimal) {
1226 tok->done = E_TOKEN;
1227 tok_backup(tok, c);
1228 return ERRORTOKEN;
1229 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001230 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001231 if (c == 'l' || c == 'L')
1232 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 }
1234 else {
1235 /* Decimal */
1236 do {
1237 c = tok_nextc(tok);
1238 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001239 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001241 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001242 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001243 if (c == '.') {
1244 fraction:
1245 /* Fraction */
1246 do {
1247 c = tok_nextc(tok);
1248 } while (isdigit(c));
1249 }
1250 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001251 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001252 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001254 if (c == '+' || c == '-')
1255 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001256 if (!isdigit(c)) {
1257 tok->done = E_TOKEN;
1258 tok_backup(tok, c);
1259 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001260 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001261 do {
1262 c = tok_nextc(tok);
1263 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001265#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001266 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001267 /* Imaginary part */
1268 imaginary:
1269 c = tok_nextc(tok);
1270#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 }
1272 }
1273 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001274 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275 *p_end = tok->cur;
1276 return NUMBER;
1277 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001278
1279 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001280 /* String */
1281 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +00001282 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001283 int quote = c;
1284 int triple = 0;
1285 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 for (;;) {
1287 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001288 if (c == '\n') {
1289 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001290 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001291 tok_backup(tok, c);
1292 return ERRORTOKEN;
1293 }
1294 tripcount = 0;
1295 }
1296 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001297 if (triple)
1298 tok->done = E_EOFS;
1299 else
1300 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001301 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 return ERRORTOKEN;
1303 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001304 else if (c == quote) {
1305 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001306 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001307 c = tok_nextc(tok);
1308 if (c == quote) {
1309 triple = 1;
1310 tripcount = 0;
1311 continue;
1312 }
1313 tok_backup(tok, c);
1314 }
1315 if (!triple || tripcount == 3)
1316 break;
1317 }
1318 else if (c == '\\') {
1319 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001321 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001322 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001323 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 return ERRORTOKEN;
1325 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001327 else
1328 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001330 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001331 *p_end = tok->cur;
1332 return STRING;
1333 }
1334
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 /* Line continuation */
1336 if (c == '\\') {
1337 c = tok_nextc(tok);
1338 if (c != '\n') {
1339 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001340 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 return ERRORTOKEN;
1342 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001343 goto again; /* Read next line */
1344 }
1345
Guido van Rossumfbab9051991-10-20 20:25:03 +00001346 /* Check for two-character token */
1347 {
1348 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001349 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001350 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001351 int c3 = tok_nextc(tok);
1352 int token3 = PyToken_ThreeChars(c, c2, c3);
1353 if (token3 != OP) {
1354 token = token3;
1355 } else {
1356 tok_backup(tok, c3);
1357 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001358 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001359 *p_end = tok->cur;
1360 return token;
1361 }
1362 tok_backup(tok, c2);
1363 }
1364
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001365 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001366 switch (c) {
1367 case '(':
1368 case '[':
1369 case '{':
1370 tok->level++;
1371 break;
1372 case ')':
1373 case ']':
1374 case '}':
1375 tok->level--;
1376 break;
1377 }
1378
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001380 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001381 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001382 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383}
1384
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001385int
1386PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1387{
1388 int result = tok_get(tok, p_start, p_end);
1389 if (tok->decoding_erred) {
1390 result = ERRORTOKEN;
1391 tok->done = E_DECODE;
1392 }
1393 return result;
1394}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395
Guido van Rossum408027e1996-12-30 16:17:54 +00001396#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397
1398void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001399tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001400{
Guido van Rossum86bea461997-04-29 21:03:06 +00001401 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1403 printf("(%.*s)", (int)(end - start), start);
1404}
1405
1406#endif