blob: 0015daeb72536ca0fd1931b82013f612242b398d [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000030/* Convert a possibly signed character to a nonnegative int */
31/* XXX This assumes characters are 8 bits wide */
32#ifdef __CHAR_UNSIGNED__
33#define Py_CHARMASK(c) (c)
34#else
35#define Py_CHARMASK(c) ((c) & 0xff)
36#endif
37
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000042
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* One-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* Generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Anthony Baxter11490022006-04-11 05:39:14 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
125 tok->altwarning = 0;
126 tok->alterror = 0;
127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* pgen variant: strings are used as-is, so STR is handed back
   unchanged and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;  /* unused in this build */
    return str;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Neal Norwitz08062d62006-04-11 08:19:15 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Map the common spellings of UTF-8 and Latin-1 onto one canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S (or its "<name>-" prefix) is one of the known
   spellings, otherwise returns S itself.  Callers detect a match by
   comparing the returned pointer with S; the literals must not be
   freed or modified. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char
           (any byte >= 0x80 where char is signed) to tolower() is
           undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
320 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
321 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
322#if 0
323 /* Disable support for UTF-16 BOMs until a decision
324 is made whether this needs to be supported. */
325 } else if (ch == 0xFE) {
326 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
327 if (!set_readline(tok, "utf-16-be")) return 0;
328 tok->decoding_state = -1;
329 } else if (ch == 0xFF) {
330 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
331 if (!set_readline(tok, "utf-16-le")) return 0;
332 tok->decoding_state = -1;
333#endif
334 } else {
335 unget_char(ch, tok);
336 return 1;
337 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000338 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000339 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
341 return 1;
342 NON_BOM:
343 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
344 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
345 return 1;
346}
347
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   NULL is also returned when the line read is empty (EOF).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    Py_ssize_t room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the object parked in the tokenizer state. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;          /* overflow, already UTF-8 */
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL : s;    /* empty line means EOF */
#endif
}
412
413/* Set the readline function for TOK to a StreamReader's
414 readline function. The StreamReader is named ENC.
415
416 This function is called from check_bom and check_coding_spec.
417
418 ENC is usually identical to the future value of tok->encoding,
419 except for the (currently unsupported) case of UTF-16.
420
421 Return 1 on success, 0 on failure. */
422
423static int
424fp_setreadl(struct tok_state *tok, const char* enc)
425{
426 PyObject *reader, *stream, *readline;
427
Martin v. Löwis95292d62002-12-11 14:04:59 +0000428 /* XXX: constify filename argument. */
429 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000430 if (stream == NULL)
431 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000432
433 reader = PyCodec_StreamReader(enc, stream, NULL);
434 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000435 if (reader == NULL)
436 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437
438 readline = PyObject_GetAttrString(reader, "readline");
439 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000440 if (readline == NULL)
441 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000442
443 tok->decoding_readline = readline;
444 return 1;
445}
446
447/* Fetch the next byte from TOK. */
448
449static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
451}
452
453/* Unfetch the last byte back into TOK. */
454
455static void fp_ungetc(int c, struct tok_state *tok) {
456 ungetc(c, tok->fp);
457}
458
459/* Read a line of input from TOK. Determine encoding
460 if necessary. */
461
462static char *
463decoding_fgets(char *s, int size, struct tok_state *tok)
464{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000465 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000466 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000467 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 if (tok->decoding_state < 0) {
469 /* We already have a codec associated with
470 this input. */
471 line = fp_readl(s, size, tok);
472 break;
473 } else if (tok->decoding_state > 0) {
474 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000475 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477 break;
478 } else {
479 /* We have not yet determined the encoding.
480 If an encoding is found, use the file-pointer
481 reader functions from now on. */
482 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
483 return error_ret(tok);
484 assert(tok->decoding_state != 0);
485 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000486 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
488 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
489 return error_ret(tok);
490 }
491 }
492#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000493 /* The default encoding is ASCII, so make sure we don't have any
494 non-ASCII bytes in it. */
495 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000497 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 if (*c > 127) {
499 badchar = *c;
500 break;
501 }
502 }
503 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000504 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000505 /* Need to add 1 to the line number, since this line
506 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000507 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000508 "Non-ASCII character '\\x%.2x' "
509 "in file %.200s on line %i, "
510 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000511 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000512 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000513 PyErr_SetString(PyExc_SyntaxError, buf);
514 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 }
516#endif
517 return line;
518}
519
520static int
521decoding_feof(struct tok_state *tok)
522{
523 if (tok->decoding_state >= 0) {
524 return feof(tok->fp);
525 } else {
526 PyObject* buf = tok->decoding_buffer;
527 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000528 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 if (buf == NULL) {
530 error_ret(tok);
531 return 1;
532 } else {
533 tok->decoding_buffer = buf;
534 }
535 }
536 return PyObject_Length(buf) == 0;
537 }
538}
539
540/* Fetch a byte from TOK, using the string buffer. */
541
Tim Petersc9d78aa2006-03-26 23:27:58 +0000542static int
543buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000544 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Unfetch a byte from TOK, using the string buffer. */
548
Tim Petersc9d78aa2006-03-26 23:27:58 +0000549static void
550buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000552 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Set the readline function for TOK to ENC. For the string-based
556 tokenizer, this means to just record the encoding. */
557
Tim Petersc9d78aa2006-03-26 23:27:58 +0000558static int
559buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 tok->enc = enc;
561 return 1;
562}
563
564/* Return a UTF-8 encoding Python string object from the
565 C byte string STR, which is encoded with ENC. */
566
Martin v. Löwis019934b2002-08-07 12:33:18 +0000567#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568static PyObject *
569translate_into_utf8(const char* str, const char* enc) {
570 PyObject *utf8;
571 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
572 if (buf == NULL)
573 return NULL;
574 utf8 = PyUnicode_AsUTF8String(buf);
575 Py_DECREF(buf);
576 return utf8;
577}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000578#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579
580/* Decode a byte string STR for use as the buffer of TOK.
581 Look for encoding declarations inside STR, and record them
582 inside TOK. */
583
584static const char *
585decode_str(const char *str, struct tok_state *tok)
586{
587 PyObject* utf8 = NULL;
588 const char *s;
589 int lineno = 0;
590 tok->enc = NULL;
591 tok->str = str;
592 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000595 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 if (tok->enc != NULL) {
598 utf8 = translate_into_utf8(str, tok->enc);
599 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000600 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 str = PyString_AsString(utf8);
602 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000603#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 for (s = str;; s++) {
605 if (*s == '\0') break;
606 else if (*s == '\n') {
607 lineno++;
608 if (lineno == 2) break;
609 }
610 }
611 tok->enc = NULL;
612 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000613 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000614#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 if (tok->enc != NULL) {
616 assert(utf8 == NULL);
617 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000618 if (utf8 == NULL) {
619 PyErr_Format(PyExc_SyntaxError,
620 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000621 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000622 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 str = PyString_AsString(utf8);
624 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000625#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626 assert(tok->decoding_buffer == NULL);
627 tok->decoding_buffer = utf8; /* CAUTION */
628 return str;
629}
630
631#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000632
633/* Set up tokenizer for string */
634
635struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000636PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000637{
638 struct tok_state *tok = tok_new();
639 if (tok == NULL)
640 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000642 if (str == NULL) {
643 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000645 }
646
Martin v. Löwis95292d62002-12-11 14:04:59 +0000647 /* XXX: constify members. */
648 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649 return tok;
650}
651
652
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000653/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000654
655struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000656PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657{
658 struct tok_state *tok = tok_new();
659 if (tok == NULL)
660 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000661 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000662 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000663 return NULL;
664 }
665 tok->cur = tok->inp = tok->buf;
666 tok->end = tok->buf + BUFSIZ;
667 tok->fp = fp;
668 tok->prompt = ps1;
669 tok->nextprompt = ps2;
670 return tok;
671}
672
673
674/* Free a tok_state structure */
675
676void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000677PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000680 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000681#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682 Py_XDECREF(tok->decoding_readline);
683 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000684#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000685 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000686 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000687 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000688}
689
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000690#if !defined(PGEN) && defined(Py_USING_UNICODE)
691static int
692tok_stdin_decode(struct tok_state *tok, char **inp)
693{
694 PyObject *enc, *sysstdin, *decoded, *utf8;
695 const char *encoding;
696 char *converted;
697
698 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
699 return 0;
700 sysstdin = PySys_GetObject("stdin");
701 if (sysstdin == NULL || !PyFile_Check(sysstdin))
702 return 0;
703
704 enc = ((PyFileObject *)sysstdin)->f_encoding;
705 if (enc == NULL || !PyString_Check(enc))
706 return 0;
707 Py_INCREF(enc);
708
709 encoding = PyString_AsString(enc);
710 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
711 if (decoded == NULL)
712 goto error_clear;
713
714 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
715 Py_DECREF(decoded);
716 if (utf8 == NULL)
717 goto error_clear;
718
Neal Norwitz2aa9a5d2006-03-20 01:53:23 +0000719 assert(PyString_Check(utf8));
720 converted = new_string(PyString_AS_STRING(utf8),
721 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000722 Py_DECREF(utf8);
723 if (converted == NULL)
724 goto error_nomem;
725
Neal Norwitz08062d62006-04-11 08:19:15 +0000726 PyMem_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000727 *inp = converted;
728 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000729 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000730 tok->encoding = new_string(encoding, strlen(encoding));
731 if (tok->encoding == NULL)
732 goto error_nomem;
733
734 Py_DECREF(enc);
735 return 0;
736
737error_nomem:
738 Py_DECREF(enc);
739 tok->done = E_NOMEM;
740 return -1;
741
742error_clear:
743 /* Fallback to iso-8859-1: for backward compatibility */
744 Py_DECREF(enc);
745 PyErr_Clear();
746 return 0;
747}
748#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749
750/* Get next char, updating state; error code goes into tok->done */
751
752static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000753tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000756 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000757 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000758 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000759 if (tok->done != E_OK)
760 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000762 char *end = strchr(tok->inp, '\n');
763 if (end != NULL)
764 end++;
765 else {
766 end = strchr(tok->inp, '\0');
767 if (end == tok->inp) {
768 tok->done = E_EOF;
769 return EOF;
770 }
771 }
772 if (tok->start == NULL)
773 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000774 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000775 tok->lineno++;
776 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000777 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000780 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781 if (tok->nextprompt != NULL)
782 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000783 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000784 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000785 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000786 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787 tok->done = E_EOF;
788 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000789#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000790 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000791 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000792#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000794 size_t start = tok->start - tok->buf;
795 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000796 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000797 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000798 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000799 tok->lineno++;
800 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000801 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000802 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000803 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000804 tok->done = E_NOMEM;
805 return EOF;
806 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 tok->buf = buf;
808 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000809 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000810 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000811 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000812 tok->inp = tok->buf + newlen;
813 tok->end = tok->inp + 1;
814 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000815 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 else {
817 tok->lineno++;
818 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000819 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000820 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000821 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000823 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 tok->inp = strchr(tok->buf, '\0');
825 tok->end = tok->inp + 1;
826 }
827 }
828 else {
829 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000830 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000831 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 if (tok->start == NULL) {
833 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000834 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000835 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 if (tok->buf == NULL) {
837 tok->done = E_NOMEM;
838 return EOF;
839 }
840 tok->end = tok->buf + BUFSIZ;
841 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000842 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
843 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 tok->done = E_EOF;
845 done = 1;
846 }
847 else {
848 tok->done = E_OK;
849 tok->inp = strchr(tok->buf, '\0');
850 done = tok->inp[-1] == '\n';
851 }
852 }
853 else {
854 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000855 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000856 tok->done = E_EOF;
857 done = 1;
858 }
859 else
860 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000861 }
862 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000865 Py_ssize_t curstart = tok->start == NULL ? -1 :
866 tok->start - tok->buf;
867 Py_ssize_t curvalid = tok->inp - tok->buf;
868 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000869 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000870 newbuf = (char *)PyMem_REALLOC(newbuf,
871 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000872 if (newbuf == NULL) {
873 tok->done = E_NOMEM;
874 tok->cur = tok->inp;
875 return EOF;
876 }
877 tok->buf = newbuf;
878 tok->inp = tok->buf + curvalid;
879 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->start = curstart < 0 ? NULL :
881 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000882 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000883 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000884 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000885 /* Break out early on decoding
886 errors, as tok->buf will be NULL
887 */
888 if (tok->decoding_erred)
889 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 /* Last line does not end in \n,
891 fake one */
892 strcpy(tok->inp, "\n");
893 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000895 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000897 if (tok->buf != NULL) {
898 tok->cur = tok->buf + cur;
899 tok->line_start = tok->cur;
900 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000901 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000902 pt = tok->inp - 2;
903 if (pt >= tok->buf && *pt == '\r') {
904 *pt++ = '\n';
905 *pt = '\0';
906 tok->inp = pt;
907 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000908 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909 }
910 if (tok->done != E_OK) {
911 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000912 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000913 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914 return EOF;
915 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000917 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918}
919
920
921/* Back-up one character */
922
923static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000924tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000925{
926 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000927 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000928 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000929 if (*tok->cur != c)
930 *tok->cur = c;
931 }
932}
933
934
935/* Return the token corresponding to a single character */
936
937int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000938PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000939{
940 switch (c) {
941 case '(': return LPAR;
942 case ')': return RPAR;
943 case '[': return LSQB;
944 case ']': return RSQB;
945 case ':': return COLON;
946 case ',': return COMMA;
947 case ';': return SEMI;
948 case '+': return PLUS;
949 case '-': return MINUS;
950 case '*': return STAR;
951 case '/': return SLASH;
952 case '|': return VBAR;
953 case '&': return AMPER;
954 case '<': return LESS;
955 case '>': return GREATER;
956 case '=': return EQUAL;
957 case '.': return DOT;
958 case '%': return PERCENT;
959 case '`': return BACKQUOTE;
960 case '{': return LBRACE;
961 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000962 case '^': return CIRCUMFLEX;
963 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000964 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000965 default: return OP;
966 }
967}
968
969
Guido van Rossumfbab9051991-10-20 20:25:03 +0000970int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000971PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000972{
973 switch (c1) {
974 case '=':
975 switch (c2) {
976 case '=': return EQEQUAL;
977 }
978 break;
979 case '!':
980 switch (c2) {
981 case '=': return NOTEQUAL;
982 }
983 break;
984 case '<':
985 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +0000986 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000987 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000988 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000989 }
990 break;
991 case '>':
992 switch (c2) {
993 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000994 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000995 }
996 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000997 case '+':
998 switch (c2) {
999 case '=': return PLUSEQUAL;
1000 }
1001 break;
1002 case '-':
1003 switch (c2) {
1004 case '=': return MINEQUAL;
1005 }
1006 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001007 case '*':
1008 switch (c2) {
1009 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001010 case '=': return STAREQUAL;
1011 }
1012 break;
1013 case '/':
1014 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001015 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001016 case '=': return SLASHEQUAL;
1017 }
1018 break;
1019 case '|':
1020 switch (c2) {
1021 case '=': return VBAREQUAL;
1022 }
1023 break;
1024 case '%':
1025 switch (c2) {
1026 case '=': return PERCENTEQUAL;
1027 }
1028 break;
1029 case '&':
1030 switch (c2) {
1031 case '=': return AMPEREQUAL;
1032 }
1033 break;
1034 case '^':
1035 switch (c2) {
1036 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001037 }
1038 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001039 }
1040 return OP;
1041}
1042
Thomas Wouters434d0822000-08-24 20:11:32 +00001043int
1044PyToken_ThreeChars(int c1, int c2, int c3)
1045{
1046 switch (c1) {
1047 case '<':
1048 switch (c2) {
1049 case '<':
1050 switch (c3) {
1051 case '=':
1052 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001053 }
1054 break;
1055 }
1056 break;
1057 case '>':
1058 switch (c2) {
1059 case '>':
1060 switch (c3) {
1061 case '=':
1062 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001063 }
1064 break;
1065 }
1066 break;
1067 case '*':
1068 switch (c2) {
1069 case '*':
1070 switch (c3) {
1071 case '=':
1072 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001073 }
1074 break;
1075 }
1076 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001077 case '/':
1078 switch (c2) {
1079 case '/':
1080 switch (c3) {
1081 case '=':
1082 return DOUBLESLASHEQUAL;
1083 }
1084 break;
1085 }
1086 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001087 }
1088 return OP;
1089}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001090
Guido van Rossum926f13a1998-04-09 21:38:06 +00001091static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001092indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001093{
1094 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001095 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001096 tok->cur = tok->inp;
1097 return 1;
1098 }
1099 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001100 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1101 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001102 tok->altwarning = 0;
1103 }
1104 return 0;
1105}
1106
1107
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108/* Get next token, after space stripping etc. */
	/* tok_get() scans one token starting at tok->cur.

	   tok      tokenizer state; the cursor, the indentation stacks, the
	            bracket nesting level and the error code tok->done are
	            updated as a side effect.
	   p_start  receives a pointer to the first character of the token
	   p_end    receives a pointer one past the last character of the token

	   Both output pointers are cleared on entry and are filled in only
	   for NAME, NEWLINE, DOT, NUMBER, STRING and operator/punctuation
	   tokens; the INDENT, DEDENT, ENDMARKER and ERRORTOKEN returns leave
	   them NULL.

	   Returns the token type.  The error paths in this function store an
	   E_* code in tok->done before returning ERRORTOKEN, with one
	   exception: when the "<>" deprecation warning call reports failure,
	   ERRORTOKEN is returned without tok->done being assigned here. */
1109
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001110static int
1111tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112{
1113 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001114 int blankline;
1115
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001116 *p_start = *p_end = NULL;
	/* Re-entered from the Newline case below when the newline is not
	   reported as a token (ignored line, or inside brackets). */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001117 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001118 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001119 blankline = 0;
1120
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001121 /* Get indentation level */
1122 if (tok->atbol) {
1123 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001124 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001125 tok->atbol = 0;
	/* Measure the leading whitespace twice: col expands tabs with
	   tok->tabsize, altcol with tok->alttabsize.  Each is compared with
	   its own stack below; a disagreement between the two goes through
	   indenterror(). */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001126 for (;;) {
1127 c = tok_nextc(tok);
1128 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001129 col++, altcol++;
1130 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001131 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001132 altcol = (altcol/tok->alttabsize + 1)
1133 * tok->alttabsize;
1134 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001135 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001136 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137 else
1138 break;
1139 }
	/* c is the first character that is not indentation; un-read it. */
1140 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001141 if (c == '#' || c == '\n') {
1142 /* Lines with only whitespace and/or comments
1143 shouldn't affect the indentation and are
1144 not passed to the parser as NEWLINE tokens,
1145 except *totally* empty lines in interactive
1146 mode, which signal the end of a command group. */
1147 if (col == 0 && c == '\n' && tok->prompt != NULL)
1148 blankline = 0; /* Let it through */
1149 else
1150 blankline = 1; /* Ignore completely */
1151 /* We can't jump back right here since we still
1152 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001153 }
	/* The indentation stack is consulted only for lines that are
	   reported and that are not inside brackets (tok->level == 0). */
Guido van Rossuma849b831993-05-12 11:35:44 +00001154 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001155 if (col == tok->indstack[tok->indent]) {
1156 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001157 if (altcol != tok->altindstack[tok->indent]) {
1158 if (indenterror(tok))
1159 return ERRORTOKEN;
1160 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001161 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001162 else if (col > tok->indstack[tok->indent]) {
1163 /* Indent -- always one */
1164 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001165 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001166 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001167 return ERRORTOKEN;
1168 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001169 if (altcol <= tok->altindstack[tok->indent]) {
1170 if (indenterror(tok))
1171 return ERRORTOKEN;
1172 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001173 tok->pendin++;
1174 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001175 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 else /* col < tok->indstack[tok->indent] */ {
1178 /* Dedent -- any number, must be consistent */
1179 while (tok->indent > 0 &&
1180 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001181 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001182 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001183 }
1184 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001185 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001186 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001187 return ERRORTOKEN;
1188 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001189 if (altcol != tok->altindstack[tok->indent]) {
1190 if (indenterror(tok))
1191 return ERRORTOKEN;
1192 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193 }
1194 }
1195 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001196
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001197 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001198
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 /* Return pending indents/dedents */
	/* tok->pendin > 0 means INDENT tokens are owed, < 0 means DEDENT
	   tokens are owed; one of them is handed out per call. */
1200 if (tok->pendin != 0) {
1201 if (tok->pendin < 0) {
1202 tok->pendin++;
1203 return DEDENT;
1204 }
1205 else {
1206 tok->pendin--;
1207 return INDENT;
1208 }
1209 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001210
	/* Re-entered after a backslash-newline line continuation. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001212 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 /* Skip spaces */
1214 do {
1215 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001216 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001217
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001218 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001219 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001220
Guido van Rossumab5ca152000-03-31 00:52:27 +00001221 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001223 static char *tabforms[] = {
1224 "tab-width:", /* Emacs */
1225 ":tabstop=", /* vim, full form */
1226 ":ts=", /* vim, abbreviated form */
1227 "set tabsize=", /* will vi never die? */
1228 /* more templates can be added here to support other editors */
1229 };
1230 char cbuf[80];
1231 char *tp, **cp;
1232 tp = cbuf;
	/* Copy the beginning of the comment into cbuf (at most
	   sizeof(cbuf)-1 characters, stopping after a newline or EOF) so
	   that it can be searched for the markers above. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001234 *tp++ = c = tok_nextc(tok);
1235 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001236 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001237 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001238 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001239 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1240 cp++) {
1241 if ((tp = strstr(cbuf, *cp))) {
1242 int newsize = atoi(tp + strlen(*cp));
1243
1244 if (newsize >= 1 && newsize <= 40) {
1245 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001246 if (Py_VerboseFlag)
1247 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001248 "Tab size set to %d\n",
1249 newsize);
1250 }
1251 }
1252 }
	/* Discard whatever part of the comment did not fit in cbuf. */
1253 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001256
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001258 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001260 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001261
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001262 /* Identifier (most frequent token!) */
1263 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001264 /* Process b"", br"", r"", u"" and ur"" string prefixes */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001265 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001266 case 'b':
1267 case 'B':
1268 c = tok_nextc(tok);
1269 if (c == 'r' || c == 'R')
1270 c = tok_nextc(tok);
1271 if (c == '"' || c == '\'')
1272 goto letter_quote;
1273 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001274 case 'r':
1275 case 'R':
1276 c = tok_nextc(tok);
1277 if (c == '"' || c == '\'')
1278 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001279 break;
1280 case 'u':
1281 case 'U':
1282 c = tok_nextc(tok);
1283 if (c == 'r' || c == 'R')
1284 c = tok_nextc(tok);
1285 if (c == '"' || c == '\'')
1286 goto letter_quote;
1287 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001288 }
	/* Here c is either the identifier's first character or the
	   character following prefix letters that turned out not to
	   introduce a string; consume the rest of the identifier. */
Guido van Rossum24dacb31997-04-06 03:46:20 +00001289 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001291 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001293 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 *p_end = tok->cur;
1295 return NAME;
1296 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001297
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 /* Newline */
1299 if (c == '\n') {
1300 tok->atbol = 1;
	/* Newlines on ignored lines and inside brackets are not tokens. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001301 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001302 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001303 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001305 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 return NEWLINE;
1307 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001308
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001309 /* Period or number starting with period? */
1310 if (c == '.') {
1311 c = tok_nextc(tok);
1312 if (isdigit(c)) {
1313 goto fraction;
1314 }
1315 else {
1316 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001317 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001318 *p_end = tok->cur;
1319 return DOT;
1320 }
1321 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001322
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001323 /* Number */
1324 if (isdigit(c)) {
1325 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001326 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 c = tok_nextc(tok);
1328 if (c == '.')
1329 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001330#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001331 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001332 goto imaginary;
1333#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 if (c == 'x' || c == 'X') {
1335 /* Hex */
1336 do {
1337 c = tok_nextc(tok);
1338 } while (isxdigit(c));
1339 }
1340 else {
	/* found_decimal is set when a digit 8 or 9 follows the leading
	   zero; that is accepted only if the literal goes on to be a
	   float or an imaginary number. */
Tim Petersd507dab2001-08-30 20:51:59 +00001341 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001342 /* Octal; c is first char of it */
1343 /* There's no 'isoctdigit' macro, sigh */
1344 while ('0' <= c && c < '8') {
1345 c = tok_nextc(tok);
1346 }
Tim Petersd507dab2001-08-30 20:51:59 +00001347 if (isdigit(c)) {
1348 found_decimal = 1;
1349 do {
1350 c = tok_nextc(tok);
1351 } while (isdigit(c));
1352 }
1353 if (c == '.')
1354 goto fraction;
1355 else if (c == 'e' || c == 'E')
1356 goto exponent;
1357#ifndef WITHOUT_COMPLEX
1358 else if (c == 'j' || c == 'J')
1359 goto imaginary;
1360#endif
1361 else if (found_decimal) {
1362 tok->done = E_TOKEN;
1363 tok_backup(tok, c);
1364 return ERRORTOKEN;
1365 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001367 if (c == 'l' || c == 'L')
1368 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 }
1370 else {
1371 /* Decimal */
1372 do {
1373 c = tok_nextc(tok);
1374 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001375 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001376 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001377 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001378 /* Accept floating point numbers. */
	/* The fraction, exponent and imaginary labels below are also
	   jumped to from the leading-'0' branch above; fraction is
	   additionally reached from the leading-'.' case. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001379 if (c == '.') {
1380 fraction:
1381 /* Fraction */
1382 do {
1383 c = tok_nextc(tok);
1384 } while (isdigit(c));
1385 }
1386 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001387 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001388 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001390 if (c == '+' || c == '-')
1391 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001392 if (!isdigit(c)) {
1393 tok->done = E_TOKEN;
1394 tok_backup(tok, c);
1395 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001396 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001397 do {
1398 c = tok_nextc(tok);
1399 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001400 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001401#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001402 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001403 /* Imaginary part */
1404 imaginary:
1405 c = tok_nextc(tok);
1406#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 }
1408 }
1409 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001410 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 *p_end = tok->cur;
1412 return NUMBER;
1413 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001414
1415 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001416 /* String */
1417 if (c == '\'' || c == '"') {
	/* quote2 is the value tok->cur - tok->start has right after a
	   quote character that immediately follows the opening quote,
	   i.e. it identifies the second quote of '' or of '''. */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001418 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001419 int quote = c;
1420 int triple = 0;
1421 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 for (;;) {
1423 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001424 if (c == '\n') {
1425 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001426 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 tok_backup(tok, c);
1428 return ERRORTOKEN;
1429 }
1430 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001431 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001432 }
1433 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001434 if (triple)
1435 tok->done = E_EOFS;
1436 else
1437 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001438 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001439 return ERRORTOKEN;
1440 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001441 else if (c == quote) {
	/* tripcount counts consecutive quote characters; inside a
	   triple-quoted string three in a row end the string. */
1442 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001443 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001444 c = tok_nextc(tok);
1445 if (c == quote) {
1446 triple = 1;
1447 tripcount = 0;
1448 continue;
1449 }
1450 tok_backup(tok, c);
1451 }
1452 if (!triple || tripcount == 3)
1453 break;
1454 }
1455 else if (c == '\\') {
1456 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001457 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001458 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001459 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001460 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001461 return ERRORTOKEN;
1462 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001464 else
1465 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001467 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001468 *p_end = tok->cur;
1469 return STRING;
1470 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001471
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001472 /* Line continuation */
1473 if (c == '\\') {
1474 c = tok_nextc(tok);
1475 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001476 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001477 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478 return ERRORTOKEN;
1479 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001480 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 goto again; /* Read next line */
1482 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001483
Guido van Rossumfbab9051991-10-20 20:25:03 +00001484 /* Check for two-character token */
1485 {
	/* Try c plus the following character as a two-character
	   operator; when that matches, also try to extend it to a
	   three-character operator. */
1486 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001487 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001488#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001489 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001490 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1491 "<> not supported in 3.x",
1492 tok->filename, tok->lineno,
1493 NULL, NULL)) {
1494 return ERRORTOKEN;
1495 }
1496 }
1497#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001498 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001499 int c3 = tok_nextc(tok);
1500 int token3 = PyToken_ThreeChars(c, c2, c3);
1501 if (token3 != OP) {
1502 token = token3;
1503 } else {
1504 tok_backup(tok, c3);
1505 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001506 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001507 *p_end = tok->cur;
1508 return token;
1509 }
1510 tok_backup(tok, c2);
1511 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001512
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001513 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001514 switch (c) {
1515 case '(':
1516 case '[':
1517 case '{':
1518 tok->level++;
1519 break;
1520 case ')':
1521 case ']':
1522 case '}':
1523 tok->level--;
1524 break;
1525 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001526
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001527 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001528 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001529 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001530 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001531}
1532
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001533int
1534PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1535{
1536 int result = tok_get(tok, p_start, p_end);
1537 if (tok->decoding_erred) {
1538 result = ERRORTOKEN;
1539 tok->done = E_DECODE;
1540 }
1541 return result;
1542}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001543
Martin v. Löwisa5136192007-09-04 14:19:28 +00001544/* This function is only called from parsetok. However, it cannot live
1545 there, as it must be empty for PGEN, and we can check for PGEN only
1546 in this file. */
1547
1548#ifdef PGEN
/* PGEN build: there is nothing to re-encode, so always report
   "no converted text" by returning NULL.  The arguments are ignored. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
	(void)tok;
	(void)len;
	(void)offset;
	return NULL;
}
1554#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001555#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001556static PyObject *
1557dec_utf8(const char *enc, const char *text, size_t len) {
1558 PyObject *ret = NULL;
1559 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1560 if (unicode_text) {
1561 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1562 Py_DECREF(unicode_text);
1563 }
1564 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001565 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001566 }
1567 return ret;
1568}
1569
1570char *
1571PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1572{
1573 char *text = NULL;
1574 if (tok->encoding) {
1575 /* convert source to original encondig */
1576 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1577 if (lineobj != NULL) {
1578 int linelen = PyString_Size(lineobj);
1579 const char *line = PyString_AsString(lineobj);
1580 text = PyObject_MALLOC(linelen + 1);
1581 if (text != NULL && line != NULL) {
1582 if (linelen)
1583 strncpy(text, line, linelen);
1584 text[linelen] = '\0';
1585 }
1586 Py_DECREF(lineobj);
1587
1588 /* adjust error offset */
1589 if (*offset > 1) {
1590 PyObject *offsetobj = dec_utf8(tok->encoding,
1591 tok->buf, *offset-1);
1592 if (offsetobj) {
1593 *offset = PyString_Size(offsetobj) + 1;
1594 Py_DECREF(offsetobj);
1595 }
1596 }
1597
1598 }
1599 }
1600 return text;
1601
1602}
Georg Brandl76b30d12008-01-07 18:41:34 +00001603#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001604#endif
1605
Martin v. Löwisa5136192007-09-04 14:19:28 +00001606
Guido van Rossum408027e1996-12-30 16:17:54 +00001607#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001608
1609void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001610tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001611{
Guido van Rossum86bea461997-04-29 21:03:06 +00001612 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001613 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1614 printf("(%.*s)", (int)(end - start), start);
1615}
1616
1617#endif