blob: 53c883f5e4c4775795303fd2664cea1883ea1304 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Tim Petersdbd9ba62000-07-09 03:09:57 +000021extern char *PyOS_Readline(char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   The order of the entries must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* Generic operator, then the two pseudo entries that close the table */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000131#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->decoding_readline = NULL;
133 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000135 return tok;
136}
137
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138#ifdef PGEN
139
140static char *
141decoding_fgets(char *s, int size, struct tok_state *tok)
142{
143 return fgets(s, size, tok->fp);
144}
145
146static int
147decoding_feof(struct tok_state *tok)
148{
149 return feof(tok->fp);
150}
151
/* PGEN build: strings are used as they are; TOK is not consulted and
   STR is handed back unchanged. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;      /* unused in this build */
    return str;
}
157
158#else /* PGEN */
159
160static char *
161error_ret(struct tok_state *tok) /* XXX */
162{
163 tok->decoding_erred = 1;
164 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
165 PyMem_DEL(tok->buf);
166 tok->buf = NULL;
167 return NULL; /* as if it were EOF */
168}
169
170static char *
171new_string(const char *s, int len)
172{
173 char* result = PyMem_NEW(char, len + 1);
174 if (result != NULL) {
175 memcpy(result, s, len);
176 result[len] = '\0';
177 }
178 return result;
179}
180
/* Normalize the spelling of an encoding name, for utf-8 and latin-1 only.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  If the result is one of the known spellings of
   UTF-8 or Latin-1 (optionally followed by "-<suffix>"), the canonical
   constant string "utf-8" or "iso-8859-1" is returned.  Otherwise S itself
   is returned, so callers can detect "no change" by comparing pointers.
   The returned constants must not be modified or freed. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Convert through unsigned char: handing a negative value
           (other than EOF) to tolower() is undefined behavior. */
        int c = (unsigned char)s[i];
        if (c == '\0') break;
        else if (c == '_') buf[i] = '-';
        else buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
    else return s;
}
203
/* Return the coding spec in S, or NULL if none is found.

   The first SIZE bytes of S are searched for the word "coding" followed
   directly by ':' or '=', optional blanks/tabs, and then a name made of
   alphanumerics, '-', '_' and '.'.  The name is returned as a newly
   allocated string (normalized through get_normal_name); the caller owns
   it and releases it with PyMem_DEL.
   NULL is also returned when memory for the result cannot be allocated. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;
    for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *r;
        char *q;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* The cast keeps bytes >= 0x80 out of the negative range, which
           isalnum() is not defined for. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;       /* "coding:" without a name; keep looking */

        r = new_string(begin, (int)(t - begin));
        if (r == NULL)
            return NULL;    /* get_normal_name() would dereference it */
        q = get_normal_name(r);
        if (r != q) {
            /* q is a constant canonical name; return a heap copy so the
               caller can always free the result. */
            PyMem_DEL(r);
            r = new_string(q, (int)strlen(q));
        }
        return r;
    }
    return NULL;
}
239
240/* Check whether the line contains a coding spec. If it does,
241 invoke the set_readline function for the new encoding.
242 This function receives the tok_state and the new encoding.
243 Return 1 on success, 0 on failure. */
244
245static int
246check_coding_spec(const char* line, int size, struct tok_state *tok,
247 int set_readline(struct tok_state *, const char *))
248{
249 int r = 1;
250 char* cs = get_coding_spec(line, size);
251 if (cs != NULL) {
252 tok->read_coding_spec = 1;
253 if (tok->encoding == NULL) {
254 assert(tok->decoding_state == 1); /* raw */
255 if (strcmp(cs, "utf-8") == 0 ||
256 strcmp(cs, "iso-8859-1") == 0) {
257 tok->encoding = cs;
258 } else {
259 r = set_readline(tok, cs);
260 if (r) {
261 tok->encoding = cs;
262 tok->decoding_state = -1;
263 }
264 }
265 } else { /* then, compare cs with BOM */
266 r = (strcmp(tok->encoding, cs) == 0);
267 PyMem_DEL(cs);
268 }
269 }
270 return r;
271}
272
273/* See whether the file starts with a BOM. If it does,
274 invoke the set_readline function with the new encoding.
275 Return 1 on success, 0 on failure. */
276
277static int
278check_bom(int get_char(struct tok_state *),
279 void unget_char(int, struct tok_state *),
280 int set_readline(struct tok_state *, const char *),
281 struct tok_state *tok)
282{
283 int ch = get_char(tok);
284 tok->decoding_state = 1;
285 if (ch == EOF) {
286 return 1;
287 } else if (ch == 0xEF) {
288 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
289 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
290#if 0
291 /* Disable support for UTF-16 BOMs until a decision
292 is made whether this needs to be supported. */
293 } else if (ch == 0xFE) {
294 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
295 if (!set_readline(tok, "utf-16-be")) return 0;
296 tok->decoding_state = -1;
297 } else if (ch == 0xFF) {
298 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
299 if (!set_readline(tok, "utf-16-le")) return 0;
300 tok->decoding_state = -1;
301#endif
302 } else {
303 unget_char(ch, tok);
304 return 1;
305 }
306 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
307 return 1;
308 NON_BOM:
309 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
310 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
311 return 1;
312}
313
314/* Read a line of text from TOK into S, using the stream in TOK.
315 Return NULL on failure, else S. */
316
317static char *
318fp_readl(char *s, int size, struct tok_state *tok)
319{
320 PyObject* utf8;
321 PyObject* buf = tok->decoding_buffer;
322 if (buf == NULL) {
323 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000324 if (buf == NULL)
325 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326 } else {
327 tok->decoding_buffer = NULL;
328 }
329 utf8 = PyUnicode_AsUTF8String(buf);
330 Py_DECREF(buf);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000331 if (utf8 == NULL)
332 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333 else {
334 const char* str = PyString_AsString(utf8);
Tim Peters919603b2002-08-04 17:56:42 +0000335 assert(strlen(str) < (size_t)size); /* XXX */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000336 strcpy(s, str);
337 Py_DECREF(utf8);
338 if (s[0] == '\0') return NULL; /* EOF */
339 return s;
340 }
341}
342
343/* Set the readline function for TOK to a StreamReader's
344 readline function. The StreamReader is named ENC.
345
346 This function is called from check_bom and check_coding_spec.
347
348 ENC is usually identical to the future value of tok->encoding,
349 except for the (currently unsupported) case of UTF-16.
350
351 Return 1 on success, 0 on failure. */
352
353static int
354fp_setreadl(struct tok_state *tok, const char* enc)
355{
356 PyObject *reader, *stream, *readline;
357
358 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000359 if (stream == NULL)
360 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361
362 reader = PyCodec_StreamReader(enc, stream, NULL);
363 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000364 if (reader == NULL)
365 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366
367 readline = PyObject_GetAttrString(reader, "readline");
368 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000369 if (readline == NULL)
370 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371
372 tok->decoding_readline = readline;
373 return 1;
374}
375
376/* Fetch the next byte from TOK. */
377
378static int fp_getc(struct tok_state *tok) {
379 return getc(tok->fp);
380}
381
382/* Unfetch the last byte back into TOK. */
383
384static void fp_ungetc(int c, struct tok_state *tok) {
385 ungetc(c, tok->fp);
386}
387
388/* Read a line of input from TOK. Determine encoding
389 if necessary. */
390
391static char *
392decoding_fgets(char *s, int size, struct tok_state *tok)
393{
394 char *line;
395 int warn = 0, badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000396 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397 if (tok->decoding_state < 0) {
398 /* We already have a codec associated with
399 this input. */
400 line = fp_readl(s, size, tok);
401 break;
402 } else if (tok->decoding_state > 0) {
403 /* We want a 'raw' read. */
404 line = Py_UniversalNewlineFgets(s, size,
405 tok->fp, NULL);
406 warn = 1;
407 break;
408 } else {
409 /* We have not yet determined the encoding.
410 If an encoding is found, use the file-pointer
411 reader functions from now on. */
412 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
413 return error_ret(tok);
414 assert(tok->decoding_state != 0);
415 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000416 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000417 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
418 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
419 return error_ret(tok);
420 }
421 }
422#ifndef PGEN
423 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
424 unsigned char *c;
425 for (c = line; *c; c++)
426 if (*c > 127) {
427 badchar = *c;
428 break;
429 }
430 }
431 if (badchar) {
432 char buf[200];
433 sprintf(buf, "Non-ASCII character '\\x%.2x', "
434 "but no declared encoding", badchar);
435 PyErr_WarnExplicit(PyExc_DeprecationWarning,
436 buf, tok->filename, tok->lineno,
437 NULL, NULL);
438 tok->issued_encoding_warning = 1;
439 }
440#endif
441 return line;
442}
443
444static int
445decoding_feof(struct tok_state *tok)
446{
447 if (tok->decoding_state >= 0) {
448 return feof(tok->fp);
449 } else {
450 PyObject* buf = tok->decoding_buffer;
451 if (buf == NULL) {
452 buf = PyObject_CallObject(tok->decoding_readline, NULL);
453 if (buf == NULL) {
454 error_ret(tok);
455 return 1;
456 } else {
457 tok->decoding_buffer = buf;
458 }
459 }
460 return PyObject_Length(buf) == 0;
461 }
462}
463
464/* Fetch a byte from TOK, using the string buffer. */
465
466static int buf_getc(struct tok_state *tok) {
467 return *tok->str++;
468}
469
470/* Unfetch a byte from TOK, using the string buffer. */
471
472static void buf_ungetc(int c, struct tok_state *tok) {
473 tok->str--;
474 assert(*tok->str == c); /* tok->cur may point to read-only segment */
475}
476
477/* Set the readline function for TOK to ENC. For the string-based
478 tokenizer, this means to just record the encoding. */
479
480static int buf_setreadl(struct tok_state *tok, const char* enc) {
481 tok->enc = enc;
482 return 1;
483}
484
485/* Return a UTF-8 encoding Python string object from the
486 C byte string STR, which is encoded with ENC. */
487
488static PyObject *
489translate_into_utf8(const char* str, const char* enc) {
490 PyObject *utf8;
491 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
492 if (buf == NULL)
493 return NULL;
494 utf8 = PyUnicode_AsUTF8String(buf);
495 Py_DECREF(buf);
496 return utf8;
497}
498
499/* Decode a byte string STR for use as the buffer of TOK.
500 Look for encoding declarations inside STR, and record them
501 inside TOK. */
502
503static const char *
504decode_str(const char *str, struct tok_state *tok)
505{
506 PyObject* utf8 = NULL;
507 const char *s;
508 int lineno = 0;
509 tok->enc = NULL;
510 tok->str = str;
511 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
512 return NULL;
513 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000514 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 if (tok->enc != NULL) {
516 utf8 = translate_into_utf8(str, tok->enc);
517 if (utf8 == NULL)
518 return NULL;
519 str = PyString_AsString(utf8);
520 }
521 for (s = str;; s++) {
522 if (*s == '\0') break;
523 else if (*s == '\n') {
524 lineno++;
525 if (lineno == 2) break;
526 }
527 }
528 tok->enc = NULL;
529 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
530 return NULL;
531 if (tok->enc != NULL) {
532 assert(utf8 == NULL);
533 utf8 = translate_into_utf8(str, tok->enc);
534 if (utf8 == NULL)
535 return NULL;
536 str = PyString_AsString(utf8);
537 }
538 assert(tok->decoding_buffer == NULL);
539 tok->decoding_buffer = utf8; /* CAUTION */
540 return str;
541}
542
543#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000544
545/* Set up tokenizer for string */
546
547struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000548PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000549{
550 struct tok_state *tok = tok_new();
551 if (tok == NULL)
552 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553 str = (char *)decode_str(str, tok);
554 if (str == NULL)
555 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000556 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000557 return tok;
558}
559
560
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000561/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000562
563struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000564PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000565{
566 struct tok_state *tok = tok_new();
567 if (tok == NULL)
568 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000569 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
570 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000571 return NULL;
572 }
573 tok->cur = tok->inp = tok->buf;
574 tok->end = tok->buf + BUFSIZ;
575 tok->fp = fp;
576 tok->prompt = ps1;
577 tok->nextprompt = ps2;
578 return tok;
579}
580
581
582/* Free a tok_state structure */
583
584void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000585PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000586{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 if (tok->encoding != NULL)
588 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000589#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590 Py_XDECREF(tok->decoding_readline);
591 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000592#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000593 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000594 PyMem_DEL(tok->buf);
595 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000596}
597
598
599/* Get next char, updating state; error code goes into tok->done */
600
601static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000602tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000604 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000605 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000606 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000607 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000608 if (tok->done != E_OK)
609 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000610 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000611 char *end = strchr(tok->inp, '\n');
612 if (end != NULL)
613 end++;
614 else {
615 end = strchr(tok->inp, '\0');
616 if (end == tok->inp) {
617 tok->done = E_EOF;
618 return EOF;
619 }
620 }
621 if (tok->start == NULL)
622 tok->buf = tok->cur;
623 tok->lineno++;
624 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000625 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000626 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000627 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000628 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000629 if (tok->nextprompt != NULL)
630 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000631 if (new == NULL)
632 tok->done = E_INTR;
633 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000634 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000635 tok->done = E_EOF;
636 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000637 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000638 size_t start = tok->start - tok->buf;
639 size_t oldlen = tok->cur - tok->buf;
640 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000641 char *buf = tok->buf;
642 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000643 tok->lineno++;
644 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000645 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000646 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000647 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000648 tok->done = E_NOMEM;
649 return EOF;
650 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000651 tok->buf = buf;
652 tok->cur = tok->buf + oldlen;
653 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000654 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000655 tok->inp = tok->buf + newlen;
656 tok->end = tok->inp + 1;
657 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000658 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000659 else {
660 tok->lineno++;
661 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000662 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000663 tok->buf = new;
664 tok->cur = tok->buf;
665 tok->inp = strchr(tok->buf, '\0');
666 tok->end = tok->inp + 1;
667 }
668 }
669 else {
670 int done = 0;
671 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000672 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000673 if (tok->start == NULL) {
674 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000675 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000676 if (tok->buf == NULL) {
677 tok->done = E_NOMEM;
678 return EOF;
679 }
680 tok->end = tok->buf + BUFSIZ;
681 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
683 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000684 tok->done = E_EOF;
685 done = 1;
686 }
687 else {
688 tok->done = E_OK;
689 tok->inp = strchr(tok->buf, '\0');
690 done = tok->inp[-1] == '\n';
691 }
692 }
693 else {
694 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000695 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000696 tok->done = E_EOF;
697 done = 1;
698 }
699 else
700 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000701 }
702 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000703 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000704 while (!done) {
705 int curstart = tok->start == NULL ? -1 :
706 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000707 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000708 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000709 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000710 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000711 if (newbuf == NULL) {
712 tok->done = E_NOMEM;
713 tok->cur = tok->inp;
714 return EOF;
715 }
716 tok->buf = newbuf;
717 tok->inp = tok->buf + curvalid;
718 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000719 tok->start = curstart < 0 ? NULL :
720 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000721 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000722 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000723 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000724 /* Last line does not end in \n,
725 fake one */
726 strcpy(tok->inp, "\n");
727 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000728 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000729 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000730 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000731 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000732#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000733 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000734 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000735 pt = tok->inp - 2;
736 if (pt >= tok->buf && *pt == '\r') {
737 *pt++ = '\n';
738 *pt = '\0';
739 tok->inp = pt;
740 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000741#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 }
743 if (tok->done != E_OK) {
744 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000745 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000746 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747 return EOF;
748 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000750 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751}
752
753
754/* Back-up one character */
755
756static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000757tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758{
759 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000760 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000761 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762 if (*tok->cur != c)
763 *tok->cur = c;
764 }
765}
766
767
768/* Return the token corresponding to a single character */
769
770int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000771PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772{
773 switch (c) {
774 case '(': return LPAR;
775 case ')': return RPAR;
776 case '[': return LSQB;
777 case ']': return RSQB;
778 case ':': return COLON;
779 case ',': return COMMA;
780 case ';': return SEMI;
781 case '+': return PLUS;
782 case '-': return MINUS;
783 case '*': return STAR;
784 case '/': return SLASH;
785 case '|': return VBAR;
786 case '&': return AMPER;
787 case '<': return LESS;
788 case '>': return GREATER;
789 case '=': return EQUAL;
790 case '.': return DOT;
791 case '%': return PERCENT;
792 case '`': return BACKQUOTE;
793 case '{': return LBRACE;
794 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000795 case '^': return CIRCUMFLEX;
796 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797 default: return OP;
798 }
799}
800
801
Guido van Rossumfbab9051991-10-20 20:25:03 +0000802int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000803PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000804{
805 switch (c1) {
806 case '=':
807 switch (c2) {
808 case '=': return EQEQUAL;
809 }
810 break;
811 case '!':
812 switch (c2) {
813 case '=': return NOTEQUAL;
814 }
815 break;
816 case '<':
817 switch (c2) {
818 case '>': return NOTEQUAL;
819 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000820 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000821 }
822 break;
823 case '>':
824 switch (c2) {
825 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000826 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000827 }
828 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000829 case '+':
830 switch (c2) {
831 case '=': return PLUSEQUAL;
832 }
833 break;
834 case '-':
835 switch (c2) {
836 case '=': return MINEQUAL;
837 }
838 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000839 case '*':
840 switch (c2) {
841 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000842 case '=': return STAREQUAL;
843 }
844 break;
845 case '/':
846 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000847 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000848 case '=': return SLASHEQUAL;
849 }
850 break;
851 case '|':
852 switch (c2) {
853 case '=': return VBAREQUAL;
854 }
855 break;
856 case '%':
857 switch (c2) {
858 case '=': return PERCENTEQUAL;
859 }
860 break;
861 case '&':
862 switch (c2) {
863 case '=': return AMPEREQUAL;
864 }
865 break;
866 case '^':
867 switch (c2) {
868 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000869 }
870 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000871 }
872 return OP;
873}
874
Thomas Wouters434d0822000-08-24 20:11:32 +0000875int
876PyToken_ThreeChars(int c1, int c2, int c3)
877{
878 switch (c1) {
879 case '<':
880 switch (c2) {
881 case '<':
882 switch (c3) {
883 case '=':
884 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000885 }
886 break;
887 }
888 break;
889 case '>':
890 switch (c2) {
891 case '>':
892 switch (c3) {
893 case '=':
894 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000895 }
896 break;
897 }
898 break;
899 case '*':
900 switch (c2) {
901 case '*':
902 switch (c3) {
903 case '=':
904 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000905 }
906 break;
907 }
908 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000909 case '/':
910 switch (c2) {
911 case '/':
912 switch (c3) {
913 case '=':
914 return DOUBLESLASHEQUAL;
915 }
916 break;
917 }
918 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000919 }
920 return OP;
921}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000922
Guido van Rossum926f13a1998-04-09 21:38:06 +0000923static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000924indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000925{
926 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000927 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000928 tok->cur = tok->inp;
929 return 1;
930 }
931 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000932 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
933 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000934 tok->altwarning = 0;
935 }
936 return 0;
937}
938
939
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000940/* Get next token, after space stripping etc. */
941
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000942static int
943tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000944{
945 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000946 int blankline;
947
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000948 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000949 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000950 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000951 blankline = 0;
952
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953 /* Get indentation level */
954 if (tok->atbol) {
955 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000956 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000958 for (;;) {
959 c = tok_nextc(tok);
960 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000961 col++, altcol++;
962 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000963 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000964 altcol = (altcol/tok->alttabsize + 1)
965 * tok->alttabsize;
966 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000967 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000968 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000969 else
970 break;
971 }
972 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000973 if (c == '#' || c == '\n') {
974 /* Lines with only whitespace and/or comments
975 shouldn't affect the indentation and are
976 not passed to the parser as NEWLINE tokens,
977 except *totally* empty lines in interactive
978 mode, which signal the end of a command group. */
979 if (col == 0 && c == '\n' && tok->prompt != NULL)
980 blankline = 0; /* Let it through */
981 else
982 blankline = 1; /* Ignore completely */
983 /* We can't jump back right here since we still
984 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000985 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000986 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000987 if (col == tok->indstack[tok->indent]) {
988 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000989 if (altcol != tok->altindstack[tok->indent]) {
990 if (indenterror(tok))
991 return ERRORTOKEN;
992 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000993 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000994 else if (col > tok->indstack[tok->indent]) {
995 /* Indent -- always one */
996 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000997 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000998 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000999 return ERRORTOKEN;
1000 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001001 if (altcol <= tok->altindstack[tok->indent]) {
1002 if (indenterror(tok))
1003 return ERRORTOKEN;
1004 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001005 tok->pendin++;
1006 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001007 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001008 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001009 else /* col < tok->indstack[tok->indent] */ {
1010 /* Dedent -- any number, must be consistent */
1011 while (tok->indent > 0 &&
1012 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001013 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001014 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001015 }
1016 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001017 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001018 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001019 return ERRORTOKEN;
1020 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001021 if (altcol != tok->altindstack[tok->indent]) {
1022 if (indenterror(tok))
1023 return ERRORTOKEN;
1024 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025 }
1026 }
1027 }
1028
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001029 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001030
1031 /* Return pending indents/dedents */
1032 if (tok->pendin != 0) {
1033 if (tok->pendin < 0) {
1034 tok->pendin++;
1035 return DEDENT;
1036 }
1037 else {
1038 tok->pendin--;
1039 return INDENT;
1040 }
1041 }
1042
1043 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001044 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001045 /* Skip spaces */
1046 do {
1047 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001048 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001049
1050 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001051 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001052
Guido van Rossumab5ca152000-03-31 00:52:27 +00001053 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001054 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001055 static char *tabforms[] = {
1056 "tab-width:", /* Emacs */
1057 ":tabstop=", /* vim, full form */
1058 ":ts=", /* vim, abbreviated form */
1059 "set tabsize=", /* will vi never die? */
1060 /* more templates can be added here to support other editors */
1061 };
1062 char cbuf[80];
1063 char *tp, **cp;
1064 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001065 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001066 *tp++ = c = tok_nextc(tok);
1067 } while (c != EOF && c != '\n' &&
1068 tp - cbuf + 1 < sizeof(cbuf));
1069 *tp = '\0';
1070 for (cp = tabforms;
1071 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1072 cp++) {
1073 if ((tp = strstr(cbuf, *cp))) {
1074 int newsize = atoi(tp + strlen(*cp));
1075
1076 if (newsize >= 1 && newsize <= 40) {
1077 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001078 if (Py_VerboseFlag)
1079 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001080 "Tab size set to %d\n",
1081 newsize);
1082 }
1083 }
1084 }
1085 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087 }
1088
1089 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001090 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001092 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093
1094 /* Identifier (most frequent token!) */
1095 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001096 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001097 switch (c) {
1098 case 'r':
1099 case 'R':
1100 c = tok_nextc(tok);
1101 if (c == '"' || c == '\'')
1102 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001103 break;
1104 case 'u':
1105 case 'U':
1106 c = tok_nextc(tok);
1107 if (c == 'r' || c == 'R')
1108 c = tok_nextc(tok);
1109 if (c == '"' || c == '\'')
1110 goto letter_quote;
1111 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001112 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001113 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001115 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001117 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118 *p_end = tok->cur;
1119 return NAME;
1120 }
1121
1122 /* Newline */
1123 if (c == '\n') {
1124 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001125 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001126 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001127 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001128 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1129 return NEWLINE;
1130 }
1131
Guido van Rossum2d45be11997-04-11 19:16:25 +00001132#ifdef macintosh
1133 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001134 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001135 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001136 tok->done = E_TOKEN;
1137 tok->cur = tok->inp;
1138 return ERRORTOKEN;
1139 }
1140#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001141 /* Period or number starting with period? */
1142 if (c == '.') {
1143 c = tok_nextc(tok);
1144 if (isdigit(c)) {
1145 goto fraction;
1146 }
1147 else {
1148 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001149 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001150 *p_end = tok->cur;
1151 return DOT;
1152 }
1153 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001154
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001155 /* Number */
1156 if (isdigit(c)) {
1157 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001158 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 c = tok_nextc(tok);
1160 if (c == '.')
1161 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001162#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001163 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001164 goto imaginary;
1165#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001166 if (c == 'x' || c == 'X') {
1167 /* Hex */
1168 do {
1169 c = tok_nextc(tok);
1170 } while (isxdigit(c));
1171 }
1172 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001173 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001174 /* Octal; c is first char of it */
1175 /* There's no 'isoctdigit' macro, sigh */
1176 while ('0' <= c && c < '8') {
1177 c = tok_nextc(tok);
1178 }
Tim Petersd507dab2001-08-30 20:51:59 +00001179 if (isdigit(c)) {
1180 found_decimal = 1;
1181 do {
1182 c = tok_nextc(tok);
1183 } while (isdigit(c));
1184 }
1185 if (c == '.')
1186 goto fraction;
1187 else if (c == 'e' || c == 'E')
1188 goto exponent;
1189#ifndef WITHOUT_COMPLEX
1190 else if (c == 'j' || c == 'J')
1191 goto imaginary;
1192#endif
1193 else if (found_decimal) {
1194 tok->done = E_TOKEN;
1195 tok_backup(tok, c);
1196 return ERRORTOKEN;
1197 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001199 if (c == 'l' || c == 'L')
1200 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001201 }
1202 else {
1203 /* Decimal */
1204 do {
1205 c = tok_nextc(tok);
1206 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001207 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001209 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001210 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001211 if (c == '.') {
1212 fraction:
1213 /* Fraction */
1214 do {
1215 c = tok_nextc(tok);
1216 } while (isdigit(c));
1217 }
1218 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001219 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001220 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001222 if (c == '+' || c == '-')
1223 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001224 if (!isdigit(c)) {
1225 tok->done = E_TOKEN;
1226 tok_backup(tok, c);
1227 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001228 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001229 do {
1230 c = tok_nextc(tok);
1231 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001233#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001234 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001235 /* Imaginary part */
1236 imaginary:
1237 c = tok_nextc(tok);
1238#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001239 }
1240 }
1241 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001242 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001243 *p_end = tok->cur;
1244 return NUMBER;
1245 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001246
1247 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001248 /* String */
1249 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +00001250 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001251 int quote = c;
1252 int triple = 0;
1253 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 for (;;) {
1255 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001256 if (c == '\n') {
1257 if (!triple) {
1258 tok->done = E_TOKEN;
1259 tok_backup(tok, c);
1260 return ERRORTOKEN;
1261 }
1262 tripcount = 0;
1263 }
1264 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001265 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001266 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 return ERRORTOKEN;
1268 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001269 else if (c == quote) {
1270 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001271 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001272 c = tok_nextc(tok);
1273 if (c == quote) {
1274 triple = 1;
1275 tripcount = 0;
1276 continue;
1277 }
1278 tok_backup(tok, c);
1279 }
1280 if (!triple || tripcount == 3)
1281 break;
1282 }
1283 else if (c == '\\') {
1284 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001286 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001288 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 return ERRORTOKEN;
1290 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001292 else
1293 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001295 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001296 *p_end = tok->cur;
1297 return STRING;
1298 }
1299
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 /* Line continuation */
1301 if (c == '\\') {
1302 c = tok_nextc(tok);
1303 if (c != '\n') {
1304 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001305 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 return ERRORTOKEN;
1307 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 goto again; /* Read next line */
1309 }
1310
Guido van Rossumfbab9051991-10-20 20:25:03 +00001311 /* Check for two-character token */
1312 {
1313 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001314 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001315 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001316 int c3 = tok_nextc(tok);
1317 int token3 = PyToken_ThreeChars(c, c2, c3);
1318 if (token3 != OP) {
1319 token = token3;
1320 } else {
1321 tok_backup(tok, c3);
1322 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001323 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001324 *p_end = tok->cur;
1325 return token;
1326 }
1327 tok_backup(tok, c2);
1328 }
1329
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001330 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001331 switch (c) {
1332 case '(':
1333 case '[':
1334 case '{':
1335 tok->level++;
1336 break;
1337 case ')':
1338 case ']':
1339 case '}':
1340 tok->level--;
1341 break;
1342 }
1343
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001345 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001347 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348}
1349
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001350int
1351PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1352{
1353 int result = tok_get(tok, p_start, p_end);
1354 if (tok->decoding_erred) {
1355 result = ERRORTOKEN;
1356 tok->done = E_DECODE;
1357 }
1358 return result;
1359}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360
Guido van Rossum408027e1996-12-30 16:17:54 +00001361#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362
1363void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001364tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365{
Guido van Rossum86bea461997-04-29 21:03:06 +00001366 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1368 printf("(%.*s)", (int)(end - start), start);
1369}
1370
1371#endif