blob: 6b37b3196a556559b11495c85621e4dfc73f006c [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Line reader used by tok_nextc() when a prompt is set.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif

/* Forward declarations of helpers defined further down in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   This table must match the #defines in token.h!  The order of the
   entries is significant; do not regroup without keeping it. */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Guido van Rossum86bea461997-04-29 21:03:06 +0000107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 if (tok == NULL)
109 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 tok->done = E_OK;
112 tok->fp = NULL;
113 tok->tabsize = TABSIZE;
114 tok->indent = 0;
115 tok->indstack[0] = 0;
116 tok->atbol = 1;
117 tok->pendin = 0;
118 tok->prompt = tok->nextprompt = NULL;
119 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000120 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000121 tok->filename = NULL;
122 tok->altwarning = 0;
123 tok->alterror = 0;
124 tok->alttabsize = 1;
125 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000126 tok->decoding_state = 0;
127 tok->decoding_erred = 0;
128 tok->read_coding_spec = 0;
129 tok->issued_encoding_warning = 0;
130 tok->encoding = NULL;
131 tok->decoding_readline = NULL;
132 tok->decoding_buffer = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000133 return tok;
134}
135
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136#ifdef PGEN
137
138static char *
139decoding_fgets(char *s, int size, struct tok_state *tok)
140{
141 return fgets(s, size, tok->fp);
142}
143
144static int
145decoding_feof(struct tok_state *tok)
146{
147 return feof(tok->fp);
148}
149
/* PGEN build: the input string is used unchanged; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
155
156#else /* PGEN */
157
158static char *
159error_ret(struct tok_state *tok) /* XXX */
160{
161 tok->decoding_erred = 1;
162 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
163 PyMem_DEL(tok->buf);
164 tok->buf = NULL;
165 return NULL; /* as if it were EOF */
166}
167
168static char *
169new_string(const char *s, int len)
170{
171 char* result = PyMem_NEW(char, len + 1);
172 if (result != NULL) {
173 memcpy(result, s, len);
174 result[len] = '\0';
175 }
176 return result;
177}
178
/* Map the encoding name S to a canonical spelling (for utf-8 and latin-1).

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a '-' suffix); otherwise returns S itself.  A literal result must
   not be modified or freed by the caller. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char; a plain char may be negative. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
201
/* Return the coding spec in S, or NULL if none is found.

   S is searched (within its first SIZE bytes) for "coding" followed by
   ':' or '=', optional blanks, and a name made of alphanumerics, '-',
   '_' and '.'.  The name is normalized through get_normal_name().  The
   result is a newly allocated string owned by the caller; NULL is also
   returned when memory cannot be allocated. */

static char *
get_coding_spec(const char *s, int size)
{
    int i;

    for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
        const char *t = s + i;
        const char *begin;
        char *name;
        char *normal;

        if (strncmp(t, "coding", 6) != 0)
            continue;
        t += 6;
        if (t[0] != ':' && t[0] != '=')
            continue;
        do {
            t++;
        } while (t[0] == '\x20' || t[0] == '\t');

        begin = t;
        /* Cast: isalnum() is undefined for negative char values. */
        while (isalnum((unsigned char)t[0]) ||
               t[0] == '-' || t[0] == '_' || t[0] == '.')
            t++;

        if (begin == t)
            continue;	/* no name after the separator; keep looking */

        name = new_string(begin, (int)(t - begin));
        if (name == NULL)
            return NULL;
        normal = get_normal_name(name);
        if (normal != name) {
            /* The canonical name can be longer than what was
               written (e.g. "latin-1" -> "iso-8859-1"), so it
               cannot be copied into NAME; allocate a new string. */
            PyMem_DEL(name);
            name = new_string(normal, (int)strlen(normal));
        }
        return name;
    }
    return NULL;
}
237
238/* Check whether the line contains a coding spec. If it does,
239 invoke the set_readline function for the new encoding.
240 This function receives the tok_state and the new encoding.
241 Return 1 on success, 0 on failure. */
242
243static int
244check_coding_spec(const char* line, int size, struct tok_state *tok,
245 int set_readline(struct tok_state *, const char *))
246{
247 int r = 1;
248 char* cs = get_coding_spec(line, size);
249 if (cs != NULL) {
250 tok->read_coding_spec = 1;
251 if (tok->encoding == NULL) {
252 assert(tok->decoding_state == 1); /* raw */
253 if (strcmp(cs, "utf-8") == 0 ||
254 strcmp(cs, "iso-8859-1") == 0) {
255 tok->encoding = cs;
256 } else {
257 r = set_readline(tok, cs);
258 if (r) {
259 tok->encoding = cs;
260 tok->decoding_state = -1;
261 }
262 }
263 } else { /* then, compare cs with BOM */
264 r = (strcmp(tok->encoding, cs) == 0);
265 PyMem_DEL(cs);
266 }
267 }
268 return r;
269}
270
271/* See whether the file starts with a BOM. If it does,
272 invoke the set_readline function with the new encoding.
273 Return 1 on success, 0 on failure. */
274
275static int
276check_bom(int get_char(struct tok_state *),
277 void unget_char(int, struct tok_state *),
278 int set_readline(struct tok_state *, const char *),
279 struct tok_state *tok)
280{
281 int ch = get_char(tok);
282 tok->decoding_state = 1;
283 if (ch == EOF) {
284 return 1;
285 } else if (ch == 0xEF) {
286 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
287 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
288#if 0
289 /* Disable support for UTF-16 BOMs until a decision
290 is made whether this needs to be supported. */
291 } else if (ch == 0xFE) {
292 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
293 if (!set_readline(tok, "utf-16-be")) return 0;
294 tok->decoding_state = -1;
295 } else if (ch == 0xFF) {
296 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
297 if (!set_readline(tok, "utf-16-le")) return 0;
298 tok->decoding_state = -1;
299#endif
300 } else {
301 unget_char(ch, tok);
302 return 1;
303 }
304 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
305 return 1;
306 NON_BOM:
307 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
308 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
309 return 1;
310}
311
312/* Read a line of text from TOK into S, using the stream in TOK.
313 Return NULL on failure, else S. */
314
315static char *
316fp_readl(char *s, int size, struct tok_state *tok)
317{
318 PyObject* utf8;
319 PyObject* buf = tok->decoding_buffer;
320 if (buf == NULL) {
321 buf = PyObject_CallObject(tok->decoding_readline, NULL);
322 if (buf == NULL) return error_ret(tok);
323 } else {
324 tok->decoding_buffer = NULL;
325 }
326 utf8 = PyUnicode_AsUTF8String(buf);
327 Py_DECREF(buf);
328 if (utf8 == NULL) return error_ret(tok);
329 else {
330 const char* str = PyString_AsString(utf8);
Tim Peters919603b2002-08-04 17:56:42 +0000331 assert(strlen(str) < (size_t)size); /* XXX */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 strcpy(s, str);
333 Py_DECREF(utf8);
334 if (s[0] == '\0') return NULL; /* EOF */
335 return s;
336 }
337}
338
339/* Set the readline function for TOK to a StreamReader's
340 readline function. The StreamReader is named ENC.
341
342 This function is called from check_bom and check_coding_spec.
343
344 ENC is usually identical to the future value of tok->encoding,
345 except for the (currently unsupported) case of UTF-16.
346
347 Return 1 on success, 0 on failure. */
348
349static int
350fp_setreadl(struct tok_state *tok, const char* enc)
351{
352 PyObject *reader, *stream, *readline;
353
354 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
355 if (stream == NULL) return 0;
356
357 reader = PyCodec_StreamReader(enc, stream, NULL);
358 Py_DECREF(stream);
359 if (reader == NULL) return 0;
360
361 readline = PyObject_GetAttrString(reader, "readline");
362 Py_DECREF(reader);
363 if (readline == NULL) return 0;
364
365 tok->decoding_readline = readline;
366 return 1;
367}
368
369/* Fetch the next byte from TOK. */
370
371static int fp_getc(struct tok_state *tok) {
372 return getc(tok->fp);
373}
374
375/* Unfetch the last byte back into TOK. */
376
377static void fp_ungetc(int c, struct tok_state *tok) {
378 ungetc(c, tok->fp);
379}
380
381/* Read a line of input from TOK. Determine encoding
382 if necessary. */
383
384static char *
385decoding_fgets(char *s, int size, struct tok_state *tok)
386{
387 char *line;
388 int warn = 0, badchar = 0;
389 for (;;)
390 if (tok->decoding_state < 0) {
391 /* We already have a codec associated with
392 this input. */
393 line = fp_readl(s, size, tok);
394 break;
395 } else if (tok->decoding_state > 0) {
396 /* We want a 'raw' read. */
397 line = Py_UniversalNewlineFgets(s, size,
398 tok->fp, NULL);
399 warn = 1;
400 break;
401 } else {
402 /* We have not yet determined the encoding.
403 If an encoding is found, use the file-pointer
404 reader functions from now on. */
405 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
406 return error_ret(tok);
407 assert(tok->decoding_state != 0);
408 }
409 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
410 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
411 return error_ret(tok);
412 }
413 }
414#ifndef PGEN
415 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
416 unsigned char *c;
417 for (c = line; *c; c++)
418 if (*c > 127) {
419 badchar = *c;
420 break;
421 }
422 }
423 if (badchar) {
424 char buf[200];
425 sprintf(buf, "Non-ASCII character '\\x%.2x', "
426 "but no declared encoding", badchar);
427 PyErr_WarnExplicit(PyExc_DeprecationWarning,
428 buf, tok->filename, tok->lineno,
429 NULL, NULL);
430 tok->issued_encoding_warning = 1;
431 }
432#endif
433 return line;
434}
435
436static int
437decoding_feof(struct tok_state *tok)
438{
439 if (tok->decoding_state >= 0) {
440 return feof(tok->fp);
441 } else {
442 PyObject* buf = tok->decoding_buffer;
443 if (buf == NULL) {
444 buf = PyObject_CallObject(tok->decoding_readline, NULL);
445 if (buf == NULL) {
446 error_ret(tok);
447 return 1;
448 } else {
449 tok->decoding_buffer = buf;
450 }
451 }
452 return PyObject_Length(buf) == 0;
453 }
454}
455
456/* Fetch a byte from TOK, using the string buffer. */
457
458static int buf_getc(struct tok_state *tok) {
459 return *tok->str++;
460}
461
462/* Unfetch a byte from TOK, using the string buffer. */
463
464static void buf_ungetc(int c, struct tok_state *tok) {
465 tok->str--;
466 assert(*tok->str == c); /* tok->cur may point to read-only segment */
467}
468
469/* Set the readline function for TOK to ENC. For the string-based
470 tokenizer, this means to just record the encoding. */
471
472static int buf_setreadl(struct tok_state *tok, const char* enc) {
473 tok->enc = enc;
474 return 1;
475}
476
477/* Return a UTF-8 encoding Python string object from the
478 C byte string STR, which is encoded with ENC. */
479
480static PyObject *
481translate_into_utf8(const char* str, const char* enc) {
482 PyObject *utf8;
483 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
484 if (buf == NULL)
485 return NULL;
486 utf8 = PyUnicode_AsUTF8String(buf);
487 Py_DECREF(buf);
488 return utf8;
489}
490
491/* Decode a byte string STR for use as the buffer of TOK.
492 Look for encoding declarations inside STR, and record them
493 inside TOK. */
494
495static const char *
496decode_str(const char *str, struct tok_state *tok)
497{
498 PyObject* utf8 = NULL;
499 const char *s;
500 int lineno = 0;
501 tok->enc = NULL;
502 tok->str = str;
503 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
504 return NULL;
505 str = tok->str; /* string after BOM if any */
506 assert(r);
507 if (tok->enc != NULL) {
508 utf8 = translate_into_utf8(str, tok->enc);
509 if (utf8 == NULL)
510 return NULL;
511 str = PyString_AsString(utf8);
512 }
513 for (s = str;; s++) {
514 if (*s == '\0') break;
515 else if (*s == '\n') {
516 lineno++;
517 if (lineno == 2) break;
518 }
519 }
520 tok->enc = NULL;
521 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
522 return NULL;
523 if (tok->enc != NULL) {
524 assert(utf8 == NULL);
525 utf8 = translate_into_utf8(str, tok->enc);
526 if (utf8 == NULL)
527 return NULL;
528 str = PyString_AsString(utf8);
529 }
530 assert(tok->decoding_buffer == NULL);
531 tok->decoding_buffer = utf8; /* CAUTION */
532 return str;
533}
534
535#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000536
537/* Set up tokenizer for string */
538
539struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000540PyTokenizer_FromString(char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000541{
542 struct tok_state *tok = tok_new();
543 if (tok == NULL)
544 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545 str = (char *)decode_str(str, tok);
546 if (str == NULL)
547 return NULL;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000548 tok->buf = tok->cur = tok->end = tok->inp = str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000549 return tok;
550}
551
552
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000553/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000554
555struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000556PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000557{
558 struct tok_state *tok = tok_new();
559 if (tok == NULL)
560 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000561 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
562 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000563 return NULL;
564 }
565 tok->cur = tok->inp = tok->buf;
566 tok->end = tok->buf + BUFSIZ;
567 tok->fp = fp;
568 tok->prompt = ps1;
569 tok->nextprompt = ps2;
570 return tok;
571}
572
573
574/* Free a tok_state structure */
575
576void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000577PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000578{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579 if (tok->encoding != NULL)
580 PyMem_DEL(tok->encoding);
581 Py_XDECREF(tok->decoding_readline);
582 Py_XDECREF(tok->decoding_buffer);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000583 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000584 PyMem_DEL(tok->buf);
585 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000586}
587
588
589/* Get next char, updating state; error code goes into tok->done */
590
591static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000592tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000593{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000594 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000595 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000596 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000597 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000598 if (tok->done != E_OK)
599 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000600 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000601 char *end = strchr(tok->inp, '\n');
602 if (end != NULL)
603 end++;
604 else {
605 end = strchr(tok->inp, '\0');
606 if (end == tok->inp) {
607 tok->done = E_EOF;
608 return EOF;
609 }
610 }
611 if (tok->start == NULL)
612 tok->buf = tok->cur;
613 tok->lineno++;
614 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000615 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000616 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617 if (tok->prompt != NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000618 char *new = PyOS_Readline(tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619 if (tok->nextprompt != NULL)
620 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000621 if (new == NULL)
622 tok->done = E_INTR;
623 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000624 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625 tok->done = E_EOF;
626 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000627 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000628 size_t start = tok->start - tok->buf;
629 size_t oldlen = tok->cur - tok->buf;
630 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000631 char *buf = tok->buf;
632 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000633 tok->lineno++;
634 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000635 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000636 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000637 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000638 tok->done = E_NOMEM;
639 return EOF;
640 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000641 tok->buf = buf;
642 tok->cur = tok->buf + oldlen;
643 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000644 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000645 tok->inp = tok->buf + newlen;
646 tok->end = tok->inp + 1;
647 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000648 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000649 else {
650 tok->lineno++;
651 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000652 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000653 tok->buf = new;
654 tok->cur = tok->buf;
655 tok->inp = strchr(tok->buf, '\0');
656 tok->end = tok->inp + 1;
657 }
658 }
659 else {
660 int done = 0;
661 int cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000662 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000663 if (tok->start == NULL) {
664 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000665 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000666 if (tok->buf == NULL) {
667 tok->done = E_NOMEM;
668 return EOF;
669 }
670 tok->end = tok->buf + BUFSIZ;
671 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000672 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
673 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000674 tok->done = E_EOF;
675 done = 1;
676 }
677 else {
678 tok->done = E_OK;
679 tok->inp = strchr(tok->buf, '\0');
680 done = tok->inp[-1] == '\n';
681 }
682 }
683 else {
684 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000685 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000686 tok->done = E_EOF;
687 done = 1;
688 }
689 else
690 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000691 }
692 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000693 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000694 while (!done) {
695 int curstart = tok->start == NULL ? -1 :
696 tok->start - tok->buf;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000697 int curvalid = tok->inp - tok->buf;
Guido van Rossum3f6bb861995-09-21 20:36:34 +0000698 int newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000699 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000700 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000701 if (newbuf == NULL) {
702 tok->done = E_NOMEM;
703 tok->cur = tok->inp;
704 return EOF;
705 }
706 tok->buf = newbuf;
707 tok->inp = tok->buf + curvalid;
708 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000709 tok->start = curstart < 0 ? NULL :
710 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000711 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000712 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000714 /* Last line does not end in \n,
715 fake one */
716 strcpy(tok->inp, "\n");
717 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000718 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000719 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000720 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000721 tok->cur = tok->buf + cur;
Guido van Rossum2d45be11997-04-11 19:16:25 +0000722#ifndef macintosh
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000723 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000724 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000725 pt = tok->inp - 2;
726 if (pt >= tok->buf && *pt == '\r') {
727 *pt++ = '\n';
728 *pt = '\0';
729 tok->inp = pt;
730 }
Guido van Rossum2d45be11997-04-11 19:16:25 +0000731#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732 }
733 if (tok->done != E_OK) {
734 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000735 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000736 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 return EOF;
738 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000740 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741}
742
743
744/* Back-up one character */
745
746static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000747tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748{
749 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000750 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000751 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752 if (*tok->cur != c)
753 *tok->cur = c;
754 }
755}
756
757
758/* Return the token corresponding to a single character */
759
760int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000761PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762{
763 switch (c) {
764 case '(': return LPAR;
765 case ')': return RPAR;
766 case '[': return LSQB;
767 case ']': return RSQB;
768 case ':': return COLON;
769 case ',': return COMMA;
770 case ';': return SEMI;
771 case '+': return PLUS;
772 case '-': return MINUS;
773 case '*': return STAR;
774 case '/': return SLASH;
775 case '|': return VBAR;
776 case '&': return AMPER;
777 case '<': return LESS;
778 case '>': return GREATER;
779 case '=': return EQUAL;
780 case '.': return DOT;
781 case '%': return PERCENT;
782 case '`': return BACKQUOTE;
783 case '{': return LBRACE;
784 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000785 case '^': return CIRCUMFLEX;
786 case '~': return TILDE;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787 default: return OP;
788 }
789}
790
791
Guido van Rossumfbab9051991-10-20 20:25:03 +0000792int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000793PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000794{
795 switch (c1) {
796 case '=':
797 switch (c2) {
798 case '=': return EQEQUAL;
799 }
800 break;
801 case '!':
802 switch (c2) {
803 case '=': return NOTEQUAL;
804 }
805 break;
806 case '<':
807 switch (c2) {
808 case '>': return NOTEQUAL;
809 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000810 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000811 }
812 break;
813 case '>':
814 switch (c2) {
815 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000816 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000817 }
818 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000819 case '+':
820 switch (c2) {
821 case '=': return PLUSEQUAL;
822 }
823 break;
824 case '-':
825 switch (c2) {
826 case '=': return MINEQUAL;
827 }
828 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000829 case '*':
830 switch (c2) {
831 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000832 case '=': return STAREQUAL;
833 }
834 break;
835 case '/':
836 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000837 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000838 case '=': return SLASHEQUAL;
839 }
840 break;
841 case '|':
842 switch (c2) {
843 case '=': return VBAREQUAL;
844 }
845 break;
846 case '%':
847 switch (c2) {
848 case '=': return PERCENTEQUAL;
849 }
850 break;
851 case '&':
852 switch (c2) {
853 case '=': return AMPEREQUAL;
854 }
855 break;
856 case '^':
857 switch (c2) {
858 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000859 }
860 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000861 }
862 return OP;
863}
864
Thomas Wouters434d0822000-08-24 20:11:32 +0000865int
866PyToken_ThreeChars(int c1, int c2, int c3)
867{
868 switch (c1) {
869 case '<':
870 switch (c2) {
871 case '<':
872 switch (c3) {
873 case '=':
874 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000875 }
876 break;
877 }
878 break;
879 case '>':
880 switch (c2) {
881 case '>':
882 switch (c3) {
883 case '=':
884 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000885 }
886 break;
887 }
888 break;
889 case '*':
890 switch (c2) {
891 case '*':
892 switch (c3) {
893 case '=':
894 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000895 }
896 break;
897 }
898 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000899 case '/':
900 switch (c2) {
901 case '/':
902 switch (c3) {
903 case '=':
904 return DOUBLESLASHEQUAL;
905 }
906 break;
907 }
908 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000909 }
910 return OP;
911}
Guido van Rossumfbab9051991-10-20 20:25:03 +0000912
Guido van Rossum926f13a1998-04-09 21:38:06 +0000913static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000914indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +0000915{
916 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +0000917 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000918 tok->cur = tok->inp;
919 return 1;
920 }
921 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +0000922 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
923 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +0000924 tok->altwarning = 0;
925 }
926 return 0;
927}
928
929
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000930/* Get next token, after space stripping etc. */
931
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000932static int
933tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000934{
935 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000936 int blankline;
937
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000938 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000939 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000940 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000941 blankline = 0;
942
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000943 /* Get indentation level */
944 if (tok->atbol) {
945 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000946 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000947 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000948 for (;;) {
949 c = tok_nextc(tok);
950 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +0000951 col++, altcol++;
952 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000954 altcol = (altcol/tok->alttabsize + 1)
955 * tok->alttabsize;
956 }
Guido van Rossum94d32b11995-07-07 22:27:27 +0000957 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000958 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000959 else
960 break;
961 }
962 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000963 if (c == '#' || c == '\n') {
964 /* Lines with only whitespace and/or comments
965 shouldn't affect the indentation and are
966 not passed to the parser as NEWLINE tokens,
967 except *totally* empty lines in interactive
968 mode, which signal the end of a command group. */
969 if (col == 0 && c == '\n' && tok->prompt != NULL)
970 blankline = 0; /* Let it through */
971 else
972 blankline = 1; /* Ignore completely */
973 /* We can't jump back right here since we still
974 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000975 }
Guido van Rossuma849b831993-05-12 11:35:44 +0000976 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000977 if (col == tok->indstack[tok->indent]) {
978 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +0000979 if (altcol != tok->altindstack[tok->indent]) {
980 if (indenterror(tok))
981 return ERRORTOKEN;
982 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000984 else if (col > tok->indstack[tok->indent]) {
985 /* Indent -- always one */
986 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +0000987 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000988 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000989 return ERRORTOKEN;
990 }
Guido van Rossum926f13a1998-04-09 21:38:06 +0000991 if (altcol <= tok->altindstack[tok->indent]) {
992 if (indenterror(tok))
993 return ERRORTOKEN;
994 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000995 tok->pendin++;
996 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000997 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000998 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000999 else /* col < tok->indstack[tok->indent] */ {
1000 /* Dedent -- any number, must be consistent */
1001 while (tok->indent > 0 &&
1002 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001003 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001004 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001005 }
1006 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001007 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001008 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001009 return ERRORTOKEN;
1010 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001011 if (altcol != tok->altindstack[tok->indent]) {
1012 if (indenterror(tok))
1013 return ERRORTOKEN;
1014 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001015 }
1016 }
1017 }
1018
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001019 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001020
1021 /* Return pending indents/dedents */
1022 if (tok->pendin != 0) {
1023 if (tok->pendin < 0) {
1024 tok->pendin++;
1025 return DEDENT;
1026 }
1027 else {
1028 tok->pendin--;
1029 return INDENT;
1030 }
1031 }
1032
1033 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001034 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001035 /* Skip spaces */
1036 do {
1037 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001038 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039
1040 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001041 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001042
Guido van Rossumab5ca152000-03-31 00:52:27 +00001043 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001044 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001045 static char *tabforms[] = {
1046 "tab-width:", /* Emacs */
1047 ":tabstop=", /* vim, full form */
1048 ":ts=", /* vim, abbreviated form */
1049 "set tabsize=", /* will vi never die? */
1050 /* more templates can be added here to support other editors */
1051 };
1052 char cbuf[80];
1053 char *tp, **cp;
1054 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001055 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001056 *tp++ = c = tok_nextc(tok);
1057 } while (c != EOF && c != '\n' &&
1058 tp - cbuf + 1 < sizeof(cbuf));
1059 *tp = '\0';
1060 for (cp = tabforms;
1061 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1062 cp++) {
1063 if ((tp = strstr(cbuf, *cp))) {
1064 int newsize = atoi(tp + strlen(*cp));
1065
1066 if (newsize >= 1 && newsize <= 40) {
1067 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001068 if (Py_VerboseFlag)
1069 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001070 "Tab size set to %d\n",
1071 newsize);
1072 }
1073 }
1074 }
1075 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001076 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001077 }
1078
1079 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001080 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001081 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001082 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001083
1084 /* Identifier (most frequent token!) */
1085 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001086 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001087 switch (c) {
1088 case 'r':
1089 case 'R':
1090 c = tok_nextc(tok);
1091 if (c == '"' || c == '\'')
1092 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001093 break;
1094 case 'u':
1095 case 'U':
1096 c = tok_nextc(tok);
1097 if (c == 'r' || c == 'R')
1098 c = tok_nextc(tok);
1099 if (c == '"' || c == '\'')
1100 goto letter_quote;
1101 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001102 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001103 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001105 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001106 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001107 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108 *p_end = tok->cur;
1109 return NAME;
1110 }
1111
1112 /* Newline */
1113 if (c == '\n') {
1114 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001115 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001116 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001117 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1119 return NEWLINE;
1120 }
1121
Guido van Rossum2d45be11997-04-11 19:16:25 +00001122#ifdef macintosh
1123 if (c == '\r') {
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001124 PySys_WriteStderr(
Guido van Rossum86bea461997-04-29 21:03:06 +00001125 "File contains \\r characters (incorrect line endings?)\n");
Guido van Rossum2d45be11997-04-11 19:16:25 +00001126 tok->done = E_TOKEN;
1127 tok->cur = tok->inp;
1128 return ERRORTOKEN;
1129 }
1130#endif
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001131 /* Period or number starting with period? */
1132 if (c == '.') {
1133 c = tok_nextc(tok);
1134 if (isdigit(c)) {
1135 goto fraction;
1136 }
1137 else {
1138 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001139 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001140 *p_end = tok->cur;
1141 return DOT;
1142 }
1143 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001144
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001145 /* Number */
1146 if (isdigit(c)) {
1147 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001148 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 c = tok_nextc(tok);
1150 if (c == '.')
1151 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001152#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001153 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001154 goto imaginary;
1155#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001156 if (c == 'x' || c == 'X') {
1157 /* Hex */
1158 do {
1159 c = tok_nextc(tok);
1160 } while (isxdigit(c));
1161 }
1162 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001163 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164 /* Octal; c is first char of it */
1165 /* There's no 'isoctdigit' macro, sigh */
1166 while ('0' <= c && c < '8') {
1167 c = tok_nextc(tok);
1168 }
Tim Petersd507dab2001-08-30 20:51:59 +00001169 if (isdigit(c)) {
1170 found_decimal = 1;
1171 do {
1172 c = tok_nextc(tok);
1173 } while (isdigit(c));
1174 }
1175 if (c == '.')
1176 goto fraction;
1177 else if (c == 'e' || c == 'E')
1178 goto exponent;
1179#ifndef WITHOUT_COMPLEX
1180 else if (c == 'j' || c == 'J')
1181 goto imaginary;
1182#endif
1183 else if (found_decimal) {
1184 tok->done = E_TOKEN;
1185 tok_backup(tok, c);
1186 return ERRORTOKEN;
1187 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001188 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001189 if (c == 'l' || c == 'L')
1190 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191 }
1192 else {
1193 /* Decimal */
1194 do {
1195 c = tok_nextc(tok);
1196 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001197 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001199 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001200 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001201 if (c == '.') {
1202 fraction:
1203 /* Fraction */
1204 do {
1205 c = tok_nextc(tok);
1206 } while (isdigit(c));
1207 }
1208 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001209 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001210 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001212 if (c == '+' || c == '-')
1213 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001214 if (!isdigit(c)) {
1215 tok->done = E_TOKEN;
1216 tok_backup(tok, c);
1217 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001218 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001219 do {
1220 c = tok_nextc(tok);
1221 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001223#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001224 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001225 /* Imaginary part */
1226 imaginary:
1227 c = tok_nextc(tok);
1228#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 }
1230 }
1231 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001232 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 *p_end = tok->cur;
1234 return NUMBER;
1235 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001236
1237 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001238 /* String */
1239 if (c == '\'' || c == '"') {
Guido van Rossum35685241998-02-16 15:42:50 +00001240 int quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001241 int quote = c;
1242 int triple = 0;
1243 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 for (;;) {
1245 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001246 if (c == '\n') {
1247 if (!triple) {
1248 tok->done = E_TOKEN;
1249 tok_backup(tok, c);
1250 return ERRORTOKEN;
1251 }
1252 tripcount = 0;
1253 }
1254 else if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001256 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 return ERRORTOKEN;
1258 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001259 else if (c == quote) {
1260 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001261 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001262 c = tok_nextc(tok);
1263 if (c == quote) {
1264 triple = 1;
1265 tripcount = 0;
1266 continue;
1267 }
1268 tok_backup(tok, c);
1269 }
1270 if (!triple || tripcount == 3)
1271 break;
1272 }
1273 else if (c == '\\') {
1274 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001276 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001278 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 return ERRORTOKEN;
1280 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001282 else
1283 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001285 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001286 *p_end = tok->cur;
1287 return STRING;
1288 }
1289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 /* Line continuation */
1291 if (c == '\\') {
1292 c = tok_nextc(tok);
1293 if (c != '\n') {
1294 tok->done = E_TOKEN;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001295 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 return ERRORTOKEN;
1297 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 goto again; /* Read next line */
1299 }
1300
Guido van Rossumfbab9051991-10-20 20:25:03 +00001301 /* Check for two-character token */
1302 {
1303 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001304 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001305 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001306 int c3 = tok_nextc(tok);
1307 int token3 = PyToken_ThreeChars(c, c2, c3);
1308 if (token3 != OP) {
1309 token = token3;
1310 } else {
1311 tok_backup(tok, c3);
1312 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001313 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001314 *p_end = tok->cur;
1315 return token;
1316 }
1317 tok_backup(tok, c2);
1318 }
1319
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001320 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001321 switch (c) {
1322 case '(':
1323 case '[':
1324 case '{':
1325 tok->level++;
1326 break;
1327 case ')':
1328 case ']':
1329 case '}':
1330 tok->level--;
1331 break;
1332 }
1333
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001335 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001337 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338}
1339
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001340int
1341PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1342{
1343 int result = tok_get(tok, p_start, p_end);
1344 if (tok->decoding_erred) {
1345 result = ERRORTOKEN;
1346 tok->done = E_DECODE;
1347 }
1348 return result;
1349}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350
Guido van Rossum408027e1996-12-30 16:17:54 +00001351#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352
1353void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001354tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355{
Guido van Rossum86bea461997-04-29 21:03:06 +00001356 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1358 printf("(%.*s)", (int)(end - start), start);
1359}
1360
1361#endif