blob: 3761a48b7294f1a7ee6e3f3db5b6b95b6c2b9e62 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Tim Petersdbd9ba62000-07-09 03:09:57 +000021extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    /* The three entries below are not concrete source tokens. */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000131#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->decoding_readline = NULL;
133 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135 return tok;
136}
137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138#ifdef PGEN
139
140static char *
141decoding_fgets(char *s, int size, struct tok_state *tok)
142{
143 return fgets(s, size, tok->fp);
144}
145
146static int
147decoding_feof(struct tok_state *tok)
148{
149 return feof(tok->fp);
150}
151
/* PGEN variant: strings are used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;
    return unchanged;
}
157
158#else /* PGEN */
159
160static char *
161error_ret(struct tok_state *tok) /* XXX */
162{
163 tok->decoding_erred = 1;
164 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
165 PyMem_DEL(tok->buf);
166 tok->buf = NULL;
167 return NULL; /* as if it were EOF */
168}
169
170static char *
171new_string(const char *s, int len)
172{
173 char* result = PyMem_NEW(char, len + 1);
174 if (result != NULL) {
175 memcpy(result, s, len);
176 result[len] = '\0';
177 }
178 return result;
179}
180
/* Normalize the common spellings of the UTF-8 and Latin-1 codec names.

   At most the first 12 characters of S are examined, compared
   case-insensitively and with '_' treated as '-'.
   Returns the string literal "utf-8" or "iso-8859-1" for a recognized
   name (optionally followed by a "-suffix"); otherwise returns S itself.
   The result is therefore either a literal or the caller's own pointer,
   never a new allocation. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a byte >= 0x80 where char is signed) is undefined. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
203
/* Return the coding spec in S, or NULL if none is found.

   A spec is the text "coding" followed directly by ':' or '=', optional
   blanks/tabs, and a name made of alphanumerics, '-', '_' and '.'.
   Only start positions below SIZE - 6 are searched.  The result is a
   newly allocated string (normalized through get_normal_name) that the
   caller must release with PyMem_DEL.  NULL is also returned when memory
   for the result cannot be allocated. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* isalnum() is only defined for unsigned char values and EOF. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin >= t)
            continue;       /* "coding:" with no name; keep searching */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;    /* out of memory; do not pass NULL on */
        q = get_normal_name(r);
        if (r != q) {
            /* q is a string literal; hand back an owned copy instead. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
239
240/* Check whether the line contains a coding spec. If it does,
241 invoke the set_readline function for the new encoding.
242 This function receives the tok_state and the new encoding.
243 Return 1 on success, 0 on failure. */
244
245static int
246check_coding_spec(const char* line, int size, struct tok_state *tok,
247 int set_readline(struct tok_state *, const char *))
248{
249 int r = 1;
250 char* cs = get_coding_spec(line, size);
251 if (cs != NULL) {
252 tok->read_coding_spec = 1;
253 if (tok->encoding == NULL) {
254 assert(tok->decoding_state == 1); /* raw */
255 if (strcmp(cs, "utf-8") == 0 ||
256 strcmp(cs, "iso-8859-1") == 0) {
257 tok->encoding = cs;
258 } else {
259 r = set_readline(tok, cs);
260 if (r) {
261 tok->encoding = cs;
262 tok->decoding_state = -1;
263 }
264 }
265 } else { /* then, compare cs with BOM */
266 r = (strcmp(tok->encoding, cs) == 0);
267 PyMem_DEL(cs);
268 }
269 }
270 return r;
271}
272
273/* See whether the file starts with a BOM. If it does,
274 invoke the set_readline function with the new encoding.
275 Return 1 on success, 0 on failure. */
276
277static int
278check_bom(int get_char(struct tok_state *),
279 void unget_char(int, struct tok_state *),
280 int set_readline(struct tok_state *, const char *),
281 struct tok_state *tok)
282{
283 int ch = get_char(tok);
284 tok->decoding_state = 1;
285 if (ch == EOF) {
286 return 1;
287 } else if (ch == 0xEF) {
288 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
289 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
290#if 0
291 /* Disable support for UTF-16 BOMs until a decision
292 is made whether this needs to be supported. */
293 } else if (ch == 0xFE) {
294 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
295 if (!set_readline(tok, "utf-16-be")) return 0;
296 tok->decoding_state = -1;
297 } else if (ch == 0xFF) {
298 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
299 if (!set_readline(tok, "utf-16-le")) return 0;
300 tok->decoding_state = -1;
301#endif
302 } else {
303 unget_char(ch, tok);
304 return 1;
305 }
306 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
307 return 1;
308 NON_BOM:
309 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
310 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
311 return 1;
312}
313
314/* Read a line of text from TOK into S, using the stream in TOK.
315 Return NULL on failure, else S. */
316
317static char *
318fp_readl(char *s, int size, struct tok_state *tok)
319{
320 PyObject* utf8;
321 PyObject* buf = tok->decoding_buffer;
322 if (buf == NULL) {
323 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000324 if (buf == NULL)
325 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326 } else {
327 tok->decoding_buffer = NULL;
328 }
329 utf8 = PyUnicode_AsUTF8String(buf);
330 Py_DECREF(buf);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000331 if (utf8 == NULL)
332 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333 else {
334 const char* str = PyString_AsString(utf8);
Tim Peters919603b2002-08-04 17:56:42 +0000335 assert(strlen(str) < (size_t)size); /* XXX */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000336 strcpy(s, str);
337 Py_DECREF(utf8);
338 if (s[0] == '\0') return NULL; /* EOF */
339 return s;
340 }
341}
342
343/* Set the readline function for TOK to a StreamReader's
344 readline function. The StreamReader is named ENC.
345
346 This function is called from check_bom and check_coding_spec.
347
348 ENC is usually identical to the future value of tok->encoding,
349 except for the (currently unsupported) case of UTF-16.
350
351 Return 1 on success, 0 on failure. */
352
353static int
354fp_setreadl(struct tok_state *tok, const char* enc)
355{
356 PyObject *reader, *stream, *readline;
357
358 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000359 if (stream == NULL)
360 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361
362 reader = PyCodec_StreamReader(enc, stream, NULL);
363 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000364 if (reader == NULL)
365 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366
367 readline = PyObject_GetAttrString(reader, "readline");
368 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000369 if (readline == NULL)
370 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371
372 tok->decoding_readline = readline;
373 return 1;
374}
375
376/* Fetch the next byte from TOK. */
377
378static int fp_getc(struct tok_state *tok) {
379 return getc(tok->fp);
380}
381
382/* Unfetch the last byte back into TOK. */
383
384static void fp_ungetc(int c, struct tok_state *tok) {
385 ungetc(c, tok->fp);
386}
387
388/* Read a line of input from TOK. Determine encoding
389 if necessary. */
390
391static char *
392decoding_fgets(char *s, int size, struct tok_state *tok)
393{
394 char *line;
395 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000396 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397 if (tok->decoding_state < 0) {
398 /* We already have a codec associated with
399 this input. */
400 line = fp_readl(s, size, tok);
401 break;
402 } else if (tok->decoding_state > 0) {
403 /* We want a 'raw' read. */
404 line = Py_UniversalNewlineFgets(s, size,
405 tok->fp, NULL);
406 warn = 1;
407 break;
408 } else {
409 /* We have not yet determined the encoding.
410 If an encoding is found, use the file-pointer
411 reader functions from now on. */
412 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
413 return error_ret(tok);
414 assert(tok->decoding_state != 0);
415 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000416 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000417 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
418 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
419 return error_ret(tok);
420 }
421 }
422#ifndef PGEN
423 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
424 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000425 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000426 if (*c > 127) {
427 badchar = *c;
428 break;
429 }
430 }
431 if (badchar) {
432 char buf[200];
433 sprintf(buf, "Non-ASCII character '\\x%.2x', "
434 "but no declared encoding", badchar);
Martin v. Löwis725bb232002-08-05 01:49:16 +0000435 /* Need to add 1 to the line number, since this line
436 has not been counted, yet. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437 PyErr_WarnExplicit(PyExc_DeprecationWarning,
Martin v. Löwis725bb232002-08-05 01:49:16 +0000438 buf, tok->filename, tok->lineno + 1,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000439 NULL, NULL);
440 tok->issued_encoding_warning = 1;
441 }
442#endif
443 return line;
444}
445
446static int
447decoding_feof(struct tok_state *tok)
448{
449 if (tok->decoding_state >= 0) {
450 return feof(tok->fp);
451 } else {
452 PyObject* buf = tok->decoding_buffer;
453 if (buf == NULL) {
454 buf = PyObject_CallObject(tok->decoding_readline, NULL);
455 if (buf == NULL) {
456 error_ret(tok);
457 return 1;
458 } else {
459 tok->decoding_buffer = buf;
460 }
461 }
462 return PyObject_Length(buf) == 0;
463 }
464}
465
466/* Fetch a byte from TOK, using the string buffer. */
467
468static int buf_getc(struct tok_state *tok) {
469 return *tok->str++;
470}
471
472/* Unfetch a byte from TOK, using the string buffer. */
473
474static void buf_ungetc(int c, struct tok_state *tok) {
475 tok->str--;
476 assert(*tok->str == c); /* tok->cur may point to read-only segment */
477}
478
479/* Set the readline function for TOK to ENC. For the string-based
480 tokenizer, this means to just record the encoding. */
481
482static int buf_setreadl(struct tok_state *tok, const char* enc) {
483 tok->enc = enc;
484 return 1;
485}
486
487/* Return a UTF-8 encoding Python string object from the
488 C byte string STR, which is encoded with ENC. */
489
490static PyObject *
491translate_into_utf8(const char* str, const char* enc) {
492 PyObject *utf8;
493 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
494 if (buf == NULL)
495 return NULL;
496 utf8 = PyUnicode_AsUTF8String(buf);
497 Py_DECREF(buf);
498 return utf8;
499}
500
501/* Decode a byte string STR for use as the buffer of TOK.
502 Look for encoding declarations inside STR, and record them
503 inside TOK. */
504
505static const char *
506decode_str(const char *str, struct tok_state *tok)
507{
508 PyObject* utf8 = NULL;
509 const char *s;
510 int lineno = 0;
511 tok->enc = NULL;
512 tok->str = str;
513 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
514 return NULL;
515 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000516 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517 if (tok->enc != NULL) {
518 utf8 = translate_into_utf8(str, tok->enc);
519 if (utf8 == NULL)
520 return NULL;
521 str = PyString_AsString(utf8);
522 }
523 for (s = str;; s++) {
524 if (*s == '\0') break;
525 else if (*s == '\n') {
526 lineno++;
527 if (lineno == 2) break;
528 }
529 }
530 tok->enc = NULL;
531 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
532 return NULL;
533 if (tok->enc != NULL) {
534 assert(utf8 == NULL);
535 utf8 = translate_into_utf8(str, tok->enc);
536 if (utf8 == NULL)
537 return NULL;
538 str = PyString_AsString(utf8);
539 }
540 assert(tok->decoding_buffer == NULL);
541 tok->decoding_buffer = utf8; /* CAUTION */
542 return str;
543}
544
545#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000546
547/* Set up tokenizer for string */
548
549struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000550PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000551{
552 struct tok_state *tok = tok_new();
553 if (tok == NULL)
554 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 str = (char *)decode_str(str, tok);
556 if (str == NULL)
557 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000558 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000559 return tok;
560}
561
562
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000563/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000564
565struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000566PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000567{
568 struct tok_state *tok = tok_new();
569 if (tok == NULL)
570 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000571 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
572 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000573 return NULL;
574 }
575 tok->cur = tok->inp = tok->buf;
576 tok->end = tok->buf + BUFSIZ;
577 tok->fp = fp;
578 tok->prompt = ps1;
579 tok->nextprompt = ps2;
580 return tok;
581}
582
583
584/* Free a tok_state structure */
585
586void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000587PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000588{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 if (tok->encoding != NULL)
590 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000591#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 Py_XDECREF(tok->decoding_readline);
593 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000594#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000595 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000596 PyMem_DEL(tok->buf);
597 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000598}
599
600
601/* Get next char, updating state; error code goes into tok->done */
602
603static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000604tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000605{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000606 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000607 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000608 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000609 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000610 if (tok->done != E_OK)
611 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000612 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000613 char *end = strchr(tok->inp, '\n');
614 if (end != NULL)
615 end++;
616 else {
617 end = strchr(tok->inp, '\0');
618 if (end == tok->inp) {
619 tok->done = E_EOF;
620 return EOF;
621 }
622 }
623 if (tok->start == NULL)
624 tok->buf = tok->cur;
625 tok->lineno++;
626 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000627 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000628 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000629 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000630 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 if (tok->nextprompt != NULL)
632 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000633 if (new == NULL)
634 tok->done = E_INTR;
635 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000636 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000637 tok->done = E_EOF;
638 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000639 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000640 size_t start = tok->start - tok->buf;
641 size_t oldlen = tok->cur - tok->buf;
642 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000643 char *buf = tok->buf;
644 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000645 tok->lineno++;
646 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000647 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000648 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000649 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000650 tok->done = E_NOMEM;
651 return EOF;
652 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000653 tok->buf = buf;
654 tok->cur = tok->buf + oldlen;
655 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000656 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000657 tok->inp = tok->buf + newlen;
658 tok->end = tok->inp + 1;
659 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000660 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000661 else {
662 tok->lineno++;
663 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000664 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000665 tok->buf = new;
666 tok->cur = tok->buf;
667 tok->inp = strchr(tok->buf, '\0');
668 tok->end = tok->inp + 1;
669 }
670 }
671 else {
672 int done = 0;
673 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000674 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000675 if (tok->start == NULL) {
676 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000677 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000678 if (tok->buf == NULL) {
679 tok->done = E_NOMEM;
680 return EOF;
681 }
682 tok->end = tok->buf + BUFSIZ;
683 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
685 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000686 tok->done = E_EOF;
687 done = 1;
688 }
689 else {
690 tok->done = E_OK;
691 tok->inp = strchr(tok->buf, '\0');
692 done = tok->inp[-1] == '\n';
693 }
694 }
695 else {
696 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000697 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000698 tok->done = E_EOF;
699 done = 1;
700 }
701 else
702 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000703 }
704 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000705 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000706 while (!done) {
707 int curstart = tok->start == NULL ? -1 :
708 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000709 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000710 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000711 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000712 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000713 if (newbuf == NULL) {
714 tok->done = E_NOMEM;
715 tok->cur = tok->inp;
716 return EOF;
717 }
718 tok->buf = newbuf;
719 tok->inp = tok->buf + curvalid;
720 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000721 tok->start = curstart < 0 ? NULL :
722 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000723 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000724 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000725 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000726 /* Last line does not end in \n,
727 fake one */
728 strcpy(tok->inp, "\n");
729 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000730 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000731 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000732 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000733 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000734#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000735 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000736 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000737 pt = tok->inp - 2;
738 if (pt >= tok->buf && *pt == '\r') {
739 *pt++ = '\n';
740 *pt = '\0';
741 tok->inp = pt;
742 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000743#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744 }
745 if (tok->done != E_OK) {
746 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000747 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000748 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749 return EOF;
750 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000752 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753}
754
755
756/* Back-up one character */
757
758static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000759tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760{
761 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000762 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000763 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764 if (*tok->cur != c)
765 *tok->cur = c;
766 }
767}
768
769
770/* Return the token corresponding to a single character */
771
772int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000773PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774{
775 switch (c) {
776 case '(': return LPAR;
777 case ')': return RPAR;
778 case '[': return LSQB;
779 case ']': return RSQB;
780 case ':': return COLON;
781 case ',': return COMMA;
782 case ';': return SEMI;
783 case '+': return PLUS;
784 case '-': return MINUS;
785 case '*': return STAR;
786 case '/': return SLASH;
787 case '|': return VBAR;
788 case '&': return AMPER;
789 case '<': return LESS;
790 case '>': return GREATER;
791 case '=': return EQUAL;
792 case '.': return DOT;
793 case '%': return PERCENT;
794 case '`': return BACKQUOTE;
795 case '{': return LBRACE;
796 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000797 case '^': return CIRCUMFLEX;
798 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799 default: return OP;
800 }
801}
802
803
Guido van Rossumfbab9051991-10-20 20:25:03 +0000804int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000805PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000806{
807 switch (c1) {
808 case '=':
809 switch (c2) {
810 case '=': return EQEQUAL;
811 }
812 break;
813 case '!':
814 switch (c2) {
815 case '=': return NOTEQUAL;
816 }
817 break;
818 case '<':
819 switch (c2) {
820 case '>': return NOTEQUAL;
821 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000822 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000823 }
824 break;
825 case '>':
826 switch (c2) {
827 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000828 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000829 }
830 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000831 case '+':
832 switch (c2) {
833 case '=': return PLUSEQUAL;
834 }
835 break;
836 case '-':
837 switch (c2) {
838 case '=': return MINEQUAL;
839 }
840 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000841 case '*':
842 switch (c2) {
843 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000844 case '=': return STAREQUAL;
845 }
846 break;
847 case '/':
848 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000849 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000850 case '=': return SLASHEQUAL;
851 }
852 break;
853 case '|':
854 switch (c2) {
855 case '=': return VBAREQUAL;
856 }
857 break;
858 case '%':
859 switch (c2) {
860 case '=': return PERCENTEQUAL;
861 }
862 break;
863 case '&':
864 switch (c2) {
865 case '=': return AMPEREQUAL;
866 }
867 break;
868 case '^':
869 switch (c2) {
870 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000871 }
872 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000873 }
874 return OP;
875}
876
Thomas Wouters434d0822000-08-24 20:11:32 +0000877int
878PyToken_ThreeChars(int c1, int c2, int c3)
879{
880 switch (c1) {
881 case '<':
882 switch (c2) {
883 case '<':
884 switch (c3) {
885 case '=':
886 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000887 }
888 break;
889 }
890 break;
891 case '>':
892 switch (c2) {
893 case '>':
894 switch (c3) {
895 case '=':
896 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000897 }
898 break;
899 }
900 break;
901 case '*':
902 switch (c2) {
903 case '*':
904 switch (c3) {
905 case '=':
906 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000907 }
908 break;
909 }
910 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000911 case '/':
912 switch (c2) {
913 case '/':
914 switch (c3) {
915 case '=':
916 return DOUBLESLASHEQUAL;
917 }
918 break;
919 }
920 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000921 }
922 return OP;
923}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000924
Guido van Rossum926f13a1998-04-09 21:38:06 +0000925static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000926indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000927{
928 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000929 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000930 tok->cur = tok->inp;
931 return 1;
932 }
933 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000934 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
935 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000936 tok->altwarning = 0;
937 }
938 return 0;
939}
940
941
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942/* Get next token, after space stripping etc. */
943
/* Scan the input held in tok and return the type of the next token.

   *p_start and *p_end are cleared to NULL on entry.  They are set to
   delimit the token text (tok->start .. tok->cur) only on the paths that
   return NAME, NUMBER, STRING, NEWLINE (with the '\n' excluded), DOT or
   an operator; they stay NULL for INDENT, DEDENT, ENDMARKER and
   ERRORTOKEN.

   On a lexical error this function stores an E_* code in tok->done and
   returns ERRORTOKEN.  On EOF it returns ENDMARKER if tok->done is E_EOF
   and ERRORTOKEN otherwise (tok->done is not assigned here in that case;
   presumably tok_nextc sets it -- confirm against tok_nextc). */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000944static int
945tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946{
947 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000948 int blankline;
949
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000950 *p_start = *p_end = NULL;
	/* Re-entered from the newline handling below whenever a line yields
	   no NEWLINE token (ignored blank/comment line, or a newline while
	   tok->level > 0). */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000951 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000952 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000953 blankline = 0;
954
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000955 /* Get indentation level */
956 if (tok->atbol) {
	/* col is measured with tok->tabsize and altcol with tok->alttabsize.
	   Each is compared against its own stack (indstack / altindstack);
	   a disagreement between the two is routed to indenterror(). */
957 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000958 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000959 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000960 for (;;) {
961 c = tok_nextc(tok);
962 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000963 col++, altcol++;
964 else if (c == '\t') {
	/* Advance to the next multiple of the respective tab size. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000965 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000966 altcol = (altcol/tok->alttabsize + 1)
967 * tok->alttabsize;
968 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000969 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000970 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000971 else
972 break;
973 }
	/* c is the first character that is not indentation; hand it back so
	   the token scanner below sees it again. */
974 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000975 if (c == '#' || c == '\n') {
976 /* Lines with only whitespace and/or comments
977 shouldn't affect the indentation and are
978 not passed to the parser as NEWLINE tokens,
979 except *totally* empty lines in interactive
980 mode, which signal the end of a command group. */
	/* Interactive mode is detected here by tok->prompt != NULL. */
981 if (col == 0 && c == '\n' && tok->prompt != NULL)
982 blankline = 0; /* Let it through */
983 else
984 blankline = 1; /* Ignore completely */
985 /* We can't jump back right here since we still
986 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000987 }
	/* Indentation is only compared for lines that are not ignored and
	   that start outside any bracket (tok->level == 0). */
Guido van Rossuma849b831993-05-12 11:35:44 +0000988 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000989 if (col == tok->indstack[tok->indent]) {
990 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000991 if (altcol != tok->altindstack[tok->indent]) {
992 if (indenterror(tok))
993 return ERRORTOKEN;
994 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000996 else if (col > tok->indstack[tok->indent]) {
997 /* Indent -- always one */
	/* Refuse to push past the end of the indentation stacks. */
998 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000999 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001000 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001001 return ERRORTOKEN;
1002 }
	/* col grew, so altcol must have grown as well. */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001003 if (altcol <= tok->altindstack[tok->indent]) {
1004 if (indenterror(tok))
1005 return ERRORTOKEN;
1006 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001007 tok->pendin++;
1008 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001009 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001010 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001011 else /* col < tok->indstack[tok->indent] */ {
1012 /* Dedent -- any number, must be consistent */
	/* Pop one level per pending DEDENT until col is no longer smaller
	   than the stack top; col must then match that level exactly. */
1013 while (tok->indent > 0 &&
1014 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001015 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001016 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001017 }
1018 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001019 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001020 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001021 return ERRORTOKEN;
1022 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001023 if (altcol != tok->altindstack[tok->indent]) {
1024 if (indenterror(tok))
1025 return ERRORTOKEN;
1026 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001027 }
1028 }
1029 }
1030
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001031 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032
1033 /* Return pending indents/dedents */
	/* tok->pendin > 0 means INDENT tokens are still owed, < 0 means
	   DEDENT tokens; exactly one is handed out per call. */
1034 if (tok->pendin != 0) {
1035 if (tok->pendin < 0) {
1036 tok->pendin++;
1037 return DEDENT;
1038 }
1039 else {
1040 tok->pendin--;
1041 return INDENT;
1042 }
1043 }
1044
	/* Re-entered after a backslash-newline line continuation. */
1045 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001046 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001047 /* Skip spaces */
1048 do {
1049 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001050 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001051
1052 /* Set start of current token */
	/* c has already been read, hence the -1. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001053 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001054
Guido van Rossumab5ca152000-03-31 00:52:27 +00001055 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001057 static char *tabforms[] = {
1058 "tab-width:", /* Emacs */
1059 ":tabstop=", /* vim, full form */
1060 ":ts=", /* vim, abbreviated form */
1061 "set tabsize=", /* will vi never die? */
1062 /* more templates can be added here to support other editors */
1063 };
1064 char cbuf[80];
1065 char *tp, **cp;
1066 tp = cbuf;
	/* Copy the comment text into cbuf, stopping after '\n' or EOF has
	   been stored or when only the slot for the terminating NUL is
	   left.  Longer comments are truncated for the search below. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001067 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001068 *tp++ = c = tok_nextc(tok);
1069 } while (c != EOF && c != '\n' &&
1070 tp - cbuf + 1 < sizeof(cbuf));
1071 *tp = '\0';
	/* Look for each editor marker in the buffered comment; the integer
	   following a marker becomes the new tok->tabsize if it is in the
	   range 1..40.  altcol/alttabsize are not touched here. */
1072 for (cp = tabforms;
1073 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1074 cp++) {
1075 if ((tp = strstr(cbuf, *cp))) {
1076 int newsize = atoi(tp + strlen(*cp));
1077
1078 if (newsize >= 1 && newsize <= 40) {
1079 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001080 if (Py_VerboseFlag)
1081 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001082 "Tab size set to %d\n",
1083 newsize);
1084 }
1085 }
1086 }
	/* Discard whatever part of the comment did not fit into cbuf.  On
	   exit c is '\n' or EOF and is handled by the checks below. */
1087 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001088 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089 }
1090
1091 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001092 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001094 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001095
1096 /* Identifier (most frequent token!) */
1097 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001098 /* Process r"", u"" and ur"" */
	/* If a quote follows the prefix letter(s), jump to the string
	   scanner with tok->start still pointing at the prefix, so the
	   prefix becomes part of the STRING token.  Otherwise fall out of
	   the switch and keep scanning an ordinary identifier. */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001099 switch (c) {
1100 case 'r':
1101 case 'R':
1102 c = tok_nextc(tok);
1103 if (c == '"' || c == '\'')
1104 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001105 break;
1106 case 'u':
1107 case 'U':
1108 c = tok_nextc(tok);
1109 if (c == 'r' || c == 'R')
1110 c = tok_nextc(tok);
1111 if (c == '"' || c == '\'')
1112 goto letter_quote;
1113 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001114 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001115 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001117 }
	/* c is the first character after the identifier. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001119 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 *p_end = tok->cur;
1121 return NAME;
1122 }
1123
1124 /* Newline */
1125 if (c == '\n') {
	/* The next call starts with indentation processing. */
1126 tok->atbol = 1;
	/* No NEWLINE token for ignored lines or inside brackets: start
	   over on the following line instead. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001127 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001128 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001129 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001130 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1131 return NEWLINE;
1132 }
1133
	/* Only when built with the 'macintosh' macro defined: a '\r' that
	   reaches this point is reported on stderr and treated as a token
	   error. */
Guido van Rossum2d45be11997-04-11 19:16:25 +00001134#ifdef macintosh
1135 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001136 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001137 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001138 tok->done = E_TOKEN;
1139 tok->cur = tok->inp;
1140 return ERRORTOKEN;
1141 }
1142#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001143 /* Period or number starting with period? */
1144 if (c == '.') {
1145 c = tok_nextc(tok);
1146 if (isdigit(c)) {
	/* Jumps into the middle of the number scanner below. */
1147 goto fraction;
1148 }
1149 else {
1150 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001151 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001152 *p_end = tok->cur;
1153 return DOT;
1154 }
1155 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001156
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001157 /* Number */
	/* The labels fraction, exponent and imaginary inside the decimal
	   branch are shared entry points: the leading-'.' case above and
	   the leading-'0' case below jump into them.  All paths that accept
	   the literal meet at the tok_backup() before "return NUMBER". */
1158 if (isdigit(c)) {
1159 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001160 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001161 c = tok_nextc(tok);
1162 if (c == '.')
1163 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001164#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001165 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001166 goto imaginary;
1167#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001168 if (c == 'x' || c == 'X') {
1169 /* Hex */
	/* Note: no check that at least one hex digit follows. */
1170 do {
1171 c = tok_nextc(tok);
1172 } while (isxdigit(c));
1173 }
1174 else {
	/* found_decimal records that a digit 8 or 9 followed the octal
	   digits.  Such a literal is accepted only if it continues as a
	   float ('.', 'e'/'E') or, unless WITHOUT_COMPLEX is defined, an
	   imaginary ('j'/'J'); otherwise it is an E_TOKEN error. */
Tim Petersd507dab2001-08-30 20:51:59 +00001175 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 /* Octal; c is first char of it */
1177 /* There's no 'isoctdigit' macro, sigh */
1178 while ('0' <= c && c < '8') {
1179 c = tok_nextc(tok);
1180 }
Tim Petersd507dab2001-08-30 20:51:59 +00001181 if (isdigit(c)) {
1182 found_decimal = 1;
1183 do {
1184 c = tok_nextc(tok);
1185 } while (isdigit(c));
1186 }
1187 if (c == '.')
1188 goto fraction;
1189 else if (c == 'e' || c == 'E')
1190 goto exponent;
1191#ifndef WITHOUT_COMPLEX
1192 else if (c == 'j' || c == 'J')
1193 goto imaginary;
1194#endif
1195 else if (found_decimal) {
1196 tok->done = E_TOKEN;
1197 tok_backup(tok, c);
1198 return ERRORTOKEN;
1199 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200 }
	/* Optional long-integer suffix on a hex or octal literal. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001201 if (c == 'l' || c == 'L')
1202 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203 }
1204 else {
1205 /* Decimal */
1206 do {
1207 c = tok_nextc(tok);
1208 } while (isdigit(c));
	/* An 'l'/'L' suffix ends the literal; otherwise try the float and
	   imaginary continuations. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001209 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001210 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001211 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001212 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001213 if (c == '.') {
1214 fraction:
1215 /* Fraction */
1216 do {
1217 c = tok_nextc(tok);
1218 } while (isdigit(c));
1219 }
1220 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001221 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001222 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001224 if (c == '+' || c == '-')
1225 c = tok_nextc(tok);
	/* At least one digit is required after 'e' and the optional sign. */
Tim Peters9aa70d92001-08-27 19:19:28 +00001226 if (!isdigit(c)) {
1227 tok->done = E_TOKEN;
1228 tok_backup(tok, c);
1229 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001230 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001231 do {
1232 c = tok_nextc(tok);
1233 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001235#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001236 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001237 /* Imaginary part */
	/* The label sits on the if-body, so a goto here consumes the
	   'j'/'J' suffix exactly as the fall-through path does. */
1238 imaginary:
1239 c = tok_nextc(tok);
1240#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001241 }
1242 }
	/* c is the first character after the literal. */
1243 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001244 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 *p_end = tok->cur;
1246 return NUMBER;
1247 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001248
	/* Also the jump target for string prefixes recognised in the
	   identifier branch above; c is then the opening quote. */
1249 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001250 /* String */
1251 if (c == '\'' || c == '"') {
	/* quote2: value of (tok->cur - tok->start) right after reading the
	   character that immediately follows the opening quote.  It is
	   used to recognise a second quote directly after the opening one
	   (empty string or start of a triple quote).
	   triple: set once three opening quotes have been seen.
	   tripcount: number of consecutive quote characters seen so far;
	   any other character resets it. */
Guido van Rossum35685241998-02-16 15:42:50 +00001252 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001253 int quote = c;
1254 int triple = 0;
1255 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 for (;;) {
1257 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001258 if (c == '\n') {
	/* A raw newline is only allowed inside a triple-quoted string. */
1259 if (!triple) {
1260 tok->done = E_TOKEN;
1261 tok_backup(tok, c);
1262 return ERRORTOKEN;
1263 }
1264 tripcount = 0;
1265 }
1266 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001268 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 return ERRORTOKEN;
1270 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001271 else if (c == quote) {
1272 tripcount++;
	/* Second quote right after the opening one: peek at the next
	   character.  A third quote opens a triple-quoted string;
	   anything else is given back and the test below ends the
	   (empty) single-quoted string. */
Guido van Rossum35685241998-02-16 15:42:50 +00001273 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001274 c = tok_nextc(tok);
1275 if (c == quote) {
1276 triple = 1;
1277 tripcount = 0;
1278 continue;
1279 }
1280 tok_backup(tok, c);
1281 }
	/* A single-quoted string ends at the first matching quote, a
	   triple-quoted one after three in a row. */
1282 if (!triple || tripcount == 3)
1283 break;
1284 }
1285 else if (c == '\\') {
1286 tripcount = 0;
	/* Consume the escaped character here so that an escaped quote or
	   newline is not examined by the tests above. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001288 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001290 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 return ERRORTOKEN;
1292 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001294 else
1295 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001297 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001298 *p_end = tok->cur;
1299 return STRING;
1300 }
1301
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 /* Line continuation */
1303 if (c == '\\') {
1304 c = tok_nextc(tok);
	/* The backslash must be the last character on the line; anything
	   else (including EOF) is an E_TOKEN error. */
1305 if (c != '\n') {
1306 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001307 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 return ERRORTOKEN;
1309 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 goto again; /* Read next line */
1311 }
1312
Guido van Rossumfbab9051991-10-20 20:25:03 +00001313 /* Check for two-character token */
1314 {
1315 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001316 int token = PyToken_TwoChars(c, c2);
	/* OP is the "no match" result of the lookup.  A three-character
	   operator is only tried when the first two characters already
	   form a two-character operator. */
Guido van Rossumfbab9051991-10-20 20:25:03 +00001317 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001318 int c3 = tok_nextc(tok);
1319 int token3 = PyToken_ThreeChars(c, c2, c3);
1320 if (token3 != OP) {
1321 token = token3;
1322 } else {
1323 tok_backup(tok, c3);
1324 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001325 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001326 *p_end = tok->cur;
1327 return token;
1328 }
1329 tok_backup(tok, c2);
1330 }
1331
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001332 /* Keep track of parentheses nesting level */
	/* tok->level > 0 is what suppresses NEWLINE tokens and indentation
	   processing earlier in this function.  Bracket kinds are not
	   matched against each other here, and level is not prevented from
	   going negative. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001333 switch (c) {
1334 case '(':
1335 case '[':
1336 case '{':
1337 tok->level++;
1338 break;
1339 case ')':
1340 case ']':
1341 case '}':
1342 tok->level--;
1343 break;
1344 }
1345
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001347 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001349 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350}
1351
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001352int
1353PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1354{
1355 int result = tok_get(tok, p_start, p_end);
1356 if (tok->decoding_erred) {
1357 result = ERRORTOKEN;
1358 tok->done = E_DECODE;
1359 }
1360 return result;
1361}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362
Guido van Rossum408027e1996-12-30 16:17:54 +00001363#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364
1365void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001366tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367{
Guido van Rossum86bea461997-04-29 21:03:06 +00001368 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1370 printf("(%.*s)", (int)(end - start), start);
1371}
1372
1373#endif