blob: 1314f5f1398bf20e02ab2ce62606681ceae3553e [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000030/* Convert a possibly signed character to a nonnegative int */
31/* XXX This assumes characters are 8 bits wide */
32#ifdef __CHAR_UNSIGNED__
33#define Py_CHARMASK(c) (c)
34#else
35#define Py_CHARMASK(c) ((c) & 0xff)
36#endif
37
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000042
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000043/* Token names */
44
/* Printable name for each token code, indexed by the code itself. */
char *_PyParser_TokenNames[] = {
    /* End marker, atoms and line structure */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and separators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* One-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparisons, bitwise operators and power */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Anthony Baxter11490022006-04-11 05:39:14 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
125 tok->altwarning = 0;
126 tok->alterror = 0;
127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* PGEN build: the input string is used unchanged; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Neal Norwitz08062d62006-04-11 08:19:15 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Map the spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (or starts with a "<name>-" form of) one of
   those encodings, and S itself otherwise.  Callers compare the result
   against S to tell the two cases apart. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
320 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
321 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
322#if 0
323 /* Disable support for UTF-16 BOMs until a decision
324 is made whether this needs to be supported. */
325 } else if (ch == 0xFE) {
326 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
327 if (!set_readline(tok, "utf-16-be")) return 0;
328 tok->decoding_state = -1;
329 } else if (ch == 0xFF) {
330 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
331 if (!set_readline(tok, "utf-16-le")) return 0;
332 tok->decoding_state = -1;
333#endif
334 } else {
335 unget_char(ch, tok);
336 return 1;
337 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000338 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000339 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
341 return 1;
342 NON_BOM:
343 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
344 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
345 return 1;
346}
347
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure or at end of input, else S.  At most SIZE-1
   bytes are stored, followed by a terminating NUL.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
        room (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    int room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the stored object; an exact string object is
           used as the encoded bytes directly. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    /* An empty line means EOF. */
    return (nbytes == 0) ? NULL : s;
#endif
}
412
413/* Set the readline function for TOK to a StreamReader's
414 readline function. The StreamReader is named ENC.
415
416 This function is called from check_bom and check_coding_spec.
417
418 ENC is usually identical to the future value of tok->encoding,
419 except for the (currently unsupported) case of UTF-16.
420
421 Return 1 on success, 0 on failure. */
422
423static int
424fp_setreadl(struct tok_state *tok, const char* enc)
425{
426 PyObject *reader, *stream, *readline;
427
Martin v. Löwis95292d62002-12-11 14:04:59 +0000428 /* XXX: constify filename argument. */
429 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000430 if (stream == NULL)
431 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000432
433 reader = PyCodec_StreamReader(enc, stream, NULL);
434 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000435 if (reader == NULL)
436 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437
438 readline = PyObject_GetAttrString(reader, "readline");
439 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000440 if (readline == NULL)
441 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000442
443 tok->decoding_readline = readline;
444 return 1;
445}
446
447/* Fetch the next byte from TOK. */
448
449static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
451}
452
453/* Unfetch the last byte back into TOK. */
454
455static void fp_ungetc(int c, struct tok_state *tok) {
456 ungetc(c, tok->fp);
457}
458
459/* Read a line of input from TOK. Determine encoding
460 if necessary. */
461
462static char *
463decoding_fgets(char *s, int size, struct tok_state *tok)
464{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000465 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000466 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000467 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 if (tok->decoding_state < 0) {
469 /* We already have a codec associated with
470 this input. */
471 line = fp_readl(s, size, tok);
472 break;
473 } else if (tok->decoding_state > 0) {
474 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000475 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477 break;
478 } else {
479 /* We have not yet determined the encoding.
480 If an encoding is found, use the file-pointer
481 reader functions from now on. */
482 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
483 return error_ret(tok);
484 assert(tok->decoding_state != 0);
485 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000486 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
488 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
489 return error_ret(tok);
490 }
491 }
492#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000493 /* The default encoding is ASCII, so make sure we don't have any
494 non-ASCII bytes in it. */
495 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000497 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 if (*c > 127) {
499 badchar = *c;
500 break;
501 }
502 }
503 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000504 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000505 /* Need to add 1 to the line number, since this line
506 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000507 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000508 "Non-ASCII character '\\x%.2x' "
509 "in file %.200s on line %i, "
510 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000511 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000512 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000513 PyErr_SetString(PyExc_SyntaxError, buf);
514 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 }
516#endif
517 return line;
518}
519
520static int
521decoding_feof(struct tok_state *tok)
522{
523 if (tok->decoding_state >= 0) {
524 return feof(tok->fp);
525 } else {
526 PyObject* buf = tok->decoding_buffer;
527 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000528 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 if (buf == NULL) {
530 error_ret(tok);
531 return 1;
532 } else {
533 tok->decoding_buffer = buf;
534 }
535 }
536 return PyObject_Length(buf) == 0;
537 }
538}
539
540/* Fetch a byte from TOK, using the string buffer. */
541
Tim Petersc9d78aa2006-03-26 23:27:58 +0000542static int
543buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000544 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Unfetch a byte from TOK, using the string buffer. */
548
Tim Petersc9d78aa2006-03-26 23:27:58 +0000549static void
550buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000552 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Set the readline function for TOK to ENC. For the string-based
556 tokenizer, this means to just record the encoding. */
557
Tim Petersc9d78aa2006-03-26 23:27:58 +0000558static int
559buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 tok->enc = enc;
561 return 1;
562}
563
/* Return a UTF-8 encoded Python string object built from the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns NULL if either the decode or the re-encode step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (!decoded)
        return NULL;

    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579
580/* Decode a byte string STR for use as the buffer of TOK.
581 Look for encoding declarations inside STR, and record them
582 inside TOK. */
583
584static const char *
585decode_str(const char *str, struct tok_state *tok)
586{
587 PyObject* utf8 = NULL;
588 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000589 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590 int lineno = 0;
591 tok->enc = NULL;
592 tok->str = str;
593 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000594 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000596 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000597#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598 if (tok->enc != NULL) {
599 utf8 = translate_into_utf8(str, tok->enc);
600 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000601 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 str = PyString_AsString(utf8);
603 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000604#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605 for (s = str;; s++) {
606 if (*s == '\0') break;
607 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000608 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000609 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000610 lineno++;
611 if (lineno == 2) break;
612 }
613 }
614 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000615 /* need to check line 1 and 2 separately since check_coding_spec
616 assumes a single line as input */
617 if (newl[0]) {
618 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
619 return error_ret(tok);
620 if (tok->enc == NULL && newl[1]) {
621 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
622 tok, buf_setreadl))
623 return error_ret(tok);
624 }
625 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000626#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627 if (tok->enc != NULL) {
628 assert(utf8 == NULL);
629 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000630 if (utf8 == NULL) {
631 PyErr_Format(PyExc_SyntaxError,
632 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000633 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000634 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635 str = PyString_AsString(utf8);
636 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000637#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638 assert(tok->decoding_buffer == NULL);
639 tok->decoding_buffer = utf8; /* CAUTION */
640 return str;
641}
642
643#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000644
645/* Set up tokenizer for string */
646
647struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000648PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649{
650 struct tok_state *tok = tok_new();
651 if (tok == NULL)
652 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000654 if (str == NULL) {
655 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000657 }
658
Martin v. Löwis95292d62002-12-11 14:04:59 +0000659 /* XXX: constify members. */
660 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661 return tok;
662}
663
664
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000665/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000666
667struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000668PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000669{
670 struct tok_state *tok = tok_new();
671 if (tok == NULL)
672 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000673 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000674 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675 return NULL;
676 }
677 tok->cur = tok->inp = tok->buf;
678 tok->end = tok->buf + BUFSIZ;
679 tok->fp = fp;
680 tok->prompt = ps1;
681 tok->nextprompt = ps2;
682 return tok;
683}
684
685
686/* Free a tok_state structure */
687
688void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000689PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000690{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000692 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000693#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000694 Py_XDECREF(tok->decoding_readline);
695 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000696#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000697 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000698 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000699 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000700}
701
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the interactive line *INP from sys.stdin's encoding to
   UTF-8.

   Nothing is done unless sys.stdin is a file object wrapping the C
   stdin and has a string f_encoding.  On success *INP is replaced by a
   newly allocated UTF-8 copy (the old buffer is freed) and
   tok->encoding is set to a copy of the encoding name.  If decoding or
   encoding fails, the error is cleared and *INP is left as it was.

   Returns 0 in all of those cases, and -1 with tok->done = E_NOMEM when
   an allocation fails. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode_line, *utf8_line;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold a reference for as long as enc_name is in use. */
    Py_INCREF(enc_obj);

    enc_name = PyString_AsString(enc_obj);
    unicode_line = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode_line == NULL)
        goto keep_original;

    utf8_line = PyUnicode_AsEncodedString(unicode_line, "utf-8", NULL);
    Py_DECREF(unicode_line);
    if (utf8_line == NULL)
        goto keep_original;

    assert(PyString_Check(utf8_line));
    recoded = new_string(PyString_AS_STRING(utf8_line),
                         PyString_GET_SIZE(utf8_line));
    Py_DECREF(utf8_line);
    if (recoded == NULL)
        goto out_of_memory;

    /* Swap the caller's line for the UTF-8 copy. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

keep_original:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761
762/* Get next char, updating state; error code goes into tok->done */
763
764static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000765tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000768 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000769 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000770 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000771 if (tok->done != E_OK)
772 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000774 char *end = strchr(tok->inp, '\n');
775 if (end != NULL)
776 end++;
777 else {
778 end = strchr(tok->inp, '\0');
779 if (end == tok->inp) {
780 tok->done = E_EOF;
781 return EOF;
782 }
783 }
784 if (tok->start == NULL)
785 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000786 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000787 tok->lineno++;
788 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000789 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000790 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000792 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793 if (tok->nextprompt != NULL)
794 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000795 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000796 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000797 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000798 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799 tok->done = E_EOF;
800 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000801#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000802 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000803 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000804#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000805 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000806 size_t start = tok->start - tok->buf;
807 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000808 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000809 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000810 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 tok->lineno++;
812 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000813 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000814 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000815 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000816 tok->done = E_NOMEM;
817 return EOF;
818 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->buf = buf;
820 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000821 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000822 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000823 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 tok->inp = tok->buf + newlen;
825 tok->end = tok->inp + 1;
826 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000827 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000828 else {
829 tok->lineno++;
830 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000831 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000832 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000833 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000835 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 tok->inp = strchr(tok->buf, '\0');
837 tok->end = tok->inp + 1;
838 }
839 }
840 else {
841 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000842 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000843 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 if (tok->start == NULL) {
845 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000846 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000847 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 if (tok->buf == NULL) {
849 tok->done = E_NOMEM;
850 return EOF;
851 }
852 tok->end = tok->buf + BUFSIZ;
853 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000854 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
855 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 tok->done = E_EOF;
857 done = 1;
858 }
859 else {
860 tok->done = E_OK;
861 tok->inp = strchr(tok->buf, '\0');
862 done = tok->inp[-1] == '\n';
863 }
864 }
865 else {
866 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000867 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000868 tok->done = E_EOF;
869 done = 1;
870 }
871 else
872 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000873 }
874 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000875 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000876 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000877 Py_ssize_t curstart = tok->start == NULL ? -1 :
878 tok->start - tok->buf;
879 Py_ssize_t curvalid = tok->inp - tok->buf;
880 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000881 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000882 newbuf = (char *)PyMem_REALLOC(newbuf,
883 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000884 if (newbuf == NULL) {
885 tok->done = E_NOMEM;
886 tok->cur = tok->inp;
887 return EOF;
888 }
889 tok->buf = newbuf;
890 tok->inp = tok->buf + curvalid;
891 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 tok->start = curstart < 0 ? NULL :
893 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000894 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000896 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000897 /* Break out early on decoding
898 errors, as tok->buf will be NULL
899 */
900 if (tok->decoding_erred)
901 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000902 /* Last line does not end in \n,
903 fake one */
904 strcpy(tok->inp, "\n");
905 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000906 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000907 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000908 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000909 if (tok->buf != NULL) {
910 tok->cur = tok->buf + cur;
911 tok->line_start = tok->cur;
912 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000913 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000914 pt = tok->inp - 2;
915 if (pt >= tok->buf && *pt == '\r') {
916 *pt++ = '\n';
917 *pt = '\0';
918 tok->inp = pt;
919 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000920 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000921 }
922 if (tok->done != E_OK) {
923 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000924 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000925 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000926 return EOF;
927 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000928 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000929 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000930}
931
932
933/* Back-up one character */
934
935static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000936tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000937{
938 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000939 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000940 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000941 if (*tok->cur != c)
942 *tok->cur = c;
943 }
944}
945
946
947/* Return the token corresponding to a single character */
948
949int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000950PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000951{
952 switch (c) {
953 case '(': return LPAR;
954 case ')': return RPAR;
955 case '[': return LSQB;
956 case ']': return RSQB;
957 case ':': return COLON;
958 case ',': return COMMA;
959 case ';': return SEMI;
960 case '+': return PLUS;
961 case '-': return MINUS;
962 case '*': return STAR;
963 case '/': return SLASH;
964 case '|': return VBAR;
965 case '&': return AMPER;
966 case '<': return LESS;
967 case '>': return GREATER;
968 case '=': return EQUAL;
969 case '.': return DOT;
970 case '%': return PERCENT;
971 case '`': return BACKQUOTE;
972 case '{': return LBRACE;
973 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000974 case '^': return CIRCUMFLEX;
975 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000976 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000977 default: return OP;
978 }
979}
980
981
Guido van Rossumfbab9051991-10-20 20:25:03 +0000982int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000983PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000984{
985 switch (c1) {
986 case '=':
987 switch (c2) {
988 case '=': return EQEQUAL;
989 }
990 break;
991 case '!':
992 switch (c2) {
993 case '=': return NOTEQUAL;
994 }
995 break;
996 case '<':
997 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +0000998 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000999 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001000 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001001 }
1002 break;
1003 case '>':
1004 switch (c2) {
1005 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001006 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001007 }
1008 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 case '+':
1010 switch (c2) {
1011 case '=': return PLUSEQUAL;
1012 }
1013 break;
1014 case '-':
1015 switch (c2) {
1016 case '=': return MINEQUAL;
1017 }
1018 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001019 case '*':
1020 switch (c2) {
1021 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001022 case '=': return STAREQUAL;
1023 }
1024 break;
1025 case '/':
1026 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001027 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001028 case '=': return SLASHEQUAL;
1029 }
1030 break;
1031 case '|':
1032 switch (c2) {
1033 case '=': return VBAREQUAL;
1034 }
1035 break;
1036 case '%':
1037 switch (c2) {
1038 case '=': return PERCENTEQUAL;
1039 }
1040 break;
1041 case '&':
1042 switch (c2) {
1043 case '=': return AMPEREQUAL;
1044 }
1045 break;
1046 case '^':
1047 switch (c2) {
1048 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001049 }
1050 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001051 }
1052 return OP;
1053}
1054
Thomas Wouters434d0822000-08-24 20:11:32 +00001055int
1056PyToken_ThreeChars(int c1, int c2, int c3)
1057{
1058 switch (c1) {
1059 case '<':
1060 switch (c2) {
1061 case '<':
1062 switch (c3) {
1063 case '=':
1064 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001065 }
1066 break;
1067 }
1068 break;
1069 case '>':
1070 switch (c2) {
1071 case '>':
1072 switch (c3) {
1073 case '=':
1074 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001075 }
1076 break;
1077 }
1078 break;
1079 case '*':
1080 switch (c2) {
1081 case '*':
1082 switch (c3) {
1083 case '=':
1084 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001085 }
1086 break;
1087 }
1088 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001089 case '/':
1090 switch (c2) {
1091 case '/':
1092 switch (c3) {
1093 case '=':
1094 return DOUBLESLASHEQUAL;
1095 }
1096 break;
1097 }
1098 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001099 }
1100 return OP;
1101}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001102
Guido van Rossum926f13a1998-04-09 21:38:06 +00001103static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001104indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105{
1106 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001107 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001108 tok->cur = tok->inp;
1109 return 1;
1110 }
1111 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001112 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1113 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001114 tok->altwarning = 0;
1115 }
1116 return 0;
1117}
1118
1119
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120/* Get next token, after space stripping etc. */
1121
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001122static int
1123tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001124{
1125 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001126 int blankline;
1127
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001128 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001129 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001130 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001131 blankline = 0;
1132
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001133 /* Get indentation level */
1134 if (tok->atbol) {
1135 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001136 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001138 for (;;) {
1139 c = tok_nextc(tok);
1140 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001141 col++, altcol++;
1142 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001143 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001144 altcol = (altcol/tok->alttabsize + 1)
1145 * tok->alttabsize;
1146 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001147 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001148 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 else
1150 break;
1151 }
1152 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 if (c == '#' || c == '\n') {
1154 /* Lines with only whitespace and/or comments
1155 shouldn't affect the indentation and are
1156 not passed to the parser as NEWLINE tokens,
1157 except *totally* empty lines in interactive
1158 mode, which signal the end of a command group. */
1159 if (col == 0 && c == '\n' && tok->prompt != NULL)
1160 blankline = 0; /* Let it through */
1161 else
1162 blankline = 1; /* Ignore completely */
1163 /* We can't jump back right here since we still
1164 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001166 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001167 if (col == tok->indstack[tok->indent]) {
1168 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001169 if (altcol != tok->altindstack[tok->indent]) {
1170 if (indenterror(tok))
1171 return ERRORTOKEN;
1172 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001173 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001174 else if (col > tok->indstack[tok->indent]) {
1175 /* Indent -- always one */
1176 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001177 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001178 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 return ERRORTOKEN;
1180 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001181 if (altcol <= tok->altindstack[tok->indent]) {
1182 if (indenterror(tok))
1183 return ERRORTOKEN;
1184 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001185 tok->pendin++;
1186 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001187 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001188 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001189 else /* col < tok->indstack[tok->indent] */ {
1190 /* Dedent -- any number, must be consistent */
1191 while (tok->indent > 0 &&
1192 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001193 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001194 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001195 }
1196 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001197 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001198 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001199 return ERRORTOKEN;
1200 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001201 if (altcol != tok->altindstack[tok->indent]) {
1202 if (indenterror(tok))
1203 return ERRORTOKEN;
1204 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 }
1206 }
1207 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001208
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001209 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001210
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 /* Return pending indents/dedents */
1212 if (tok->pendin != 0) {
1213 if (tok->pendin < 0) {
1214 tok->pendin++;
1215 return DEDENT;
1216 }
1217 else {
1218 tok->pendin--;
1219 return INDENT;
1220 }
1221 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001222
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001224 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001225 /* Skip spaces */
1226 do {
1227 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001228 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001229
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001230 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001231 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001232
Guido van Rossumab5ca152000-03-31 00:52:27 +00001233 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001235 static char *tabforms[] = {
1236 "tab-width:", /* Emacs */
1237 ":tabstop=", /* vim, full form */
1238 ":ts=", /* vim, abbreviated form */
1239 "set tabsize=", /* will vi never die? */
1240 /* more templates can be added here to support other editors */
1241 };
1242 char cbuf[80];
1243 char *tp, **cp;
1244 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001246 *tp++ = c = tok_nextc(tok);
1247 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001248 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001249 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001250 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001251 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1252 cp++) {
1253 if ((tp = strstr(cbuf, *cp))) {
1254 int newsize = atoi(tp + strlen(*cp));
1255
1256 if (newsize >= 1 && newsize <= 40) {
1257 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001258 if (Py_VerboseFlag)
1259 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001260 "Tab size set to %d\n",
1261 newsize);
1262 }
1263 }
1264 }
1265 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001268
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001270 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001272 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001273
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001274 /* Identifier (most frequent token!) */
1275 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001276 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001277 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001278 case 'b':
1279 case 'B':
1280 c = tok_nextc(tok);
1281 if (c == 'r' || c == 'R')
1282 c = tok_nextc(tok);
1283 if (c == '"' || c == '\'')
1284 goto letter_quote;
1285 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001286 case 'r':
1287 case 'R':
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001291 break;
1292 case 'u':
1293 case 'U':
1294 c = tok_nextc(tok);
1295 if (c == 'r' || c == 'R')
1296 c = tok_nextc(tok);
1297 if (c == '"' || c == '\'')
1298 goto letter_quote;
1299 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001300 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001301 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001303 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001305 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 *p_end = tok->cur;
1307 return NAME;
1308 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001309
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 /* Newline */
1311 if (c == '\n') {
1312 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001313 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001314 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001315 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001317 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 return NEWLINE;
1319 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001320
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001321 /* Period or number starting with period? */
1322 if (c == '.') {
1323 c = tok_nextc(tok);
1324 if (isdigit(c)) {
1325 goto fraction;
1326 }
1327 else {
1328 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001329 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001330 *p_end = tok->cur;
1331 return DOT;
1332 }
1333 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001334
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 /* Number */
1336 if (isdigit(c)) {
1337 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001338 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339 c = tok_nextc(tok);
1340 if (c == '.')
1341 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001342#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001343 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001344 goto imaginary;
1345#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001347
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001349 c = tok_nextc(tok);
1350 if (!isxdigit(c)) {
1351 tok->done = E_TOKEN;
1352 tok_backup(tok, c);
1353 return ERRORTOKEN;
1354 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355 do {
1356 c = tok_nextc(tok);
1357 } while (isxdigit(c));
1358 }
1359 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001360 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001361 /* Octal; c is first char of it */
1362 /* There's no 'isoctdigit' macro, sigh */
1363 while ('0' <= c && c < '8') {
1364 c = tok_nextc(tok);
1365 }
Tim Petersd507dab2001-08-30 20:51:59 +00001366 if (isdigit(c)) {
1367 found_decimal = 1;
1368 do {
1369 c = tok_nextc(tok);
1370 } while (isdigit(c));
1371 }
1372 if (c == '.')
1373 goto fraction;
1374 else if (c == 'e' || c == 'E')
1375 goto exponent;
1376#ifndef WITHOUT_COMPLEX
1377 else if (c == 'j' || c == 'J')
1378 goto imaginary;
1379#endif
1380 else if (found_decimal) {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1384 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001386 if (c == 'l' || c == 'L')
1387 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 }
1389 else {
1390 /* Decimal */
1391 do {
1392 c = tok_nextc(tok);
1393 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001394 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001396 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001397 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001398 if (c == '.') {
1399 fraction:
1400 /* Fraction */
1401 do {
1402 c = tok_nextc(tok);
1403 } while (isdigit(c));
1404 }
1405 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001406 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001407 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001408 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001409 if (c == '+' || c == '-')
1410 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001411 if (!isdigit(c)) {
1412 tok->done = E_TOKEN;
1413 tok_backup(tok, c);
1414 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001415 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001416 do {
1417 c = tok_nextc(tok);
1418 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001420#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001421 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001422 /* Imaginary part */
1423 imaginary:
1424 c = tok_nextc(tok);
1425#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001426 }
1427 }
1428 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001429 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 *p_end = tok->cur;
1431 return NUMBER;
1432 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001433
1434 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001435 /* String */
1436 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001437 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001438 int quote = c;
1439 int triple = 0;
1440 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 for (;;) {
1442 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001443 if (c == '\n') {
1444 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001445 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001446 tok_backup(tok, c);
1447 return ERRORTOKEN;
1448 }
1449 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001450 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001451 }
1452 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001453 if (triple)
1454 tok->done = E_EOFS;
1455 else
1456 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001457 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001458 return ERRORTOKEN;
1459 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 else if (c == quote) {
1461 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001462 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001463 c = tok_nextc(tok);
1464 if (c == quote) {
1465 triple = 1;
1466 tripcount = 0;
1467 continue;
1468 }
1469 tok_backup(tok, c);
1470 }
1471 if (!triple || tripcount == 3)
1472 break;
1473 }
1474 else if (c == '\\') {
1475 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001477 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001478 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001479 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001480 return ERRORTOKEN;
1481 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001482 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001483 else
1484 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001485 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001486 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001487 *p_end = tok->cur;
1488 return STRING;
1489 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001490
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001491 /* Line continuation */
1492 if (c == '\\') {
1493 c = tok_nextc(tok);
1494 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001495 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001496 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001497 return ERRORTOKEN;
1498 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001499 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001500 goto again; /* Read next line */
1501 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001502
Guido van Rossumfbab9051991-10-20 20:25:03 +00001503 /* Check for two-character token */
1504 {
1505 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001506 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001507#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001508 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001509 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1510 "<> not supported in 3.x",
1511 tok->filename, tok->lineno,
1512 NULL, NULL)) {
1513 return ERRORTOKEN;
1514 }
1515 }
1516#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001517 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001518 int c3 = tok_nextc(tok);
1519 int token3 = PyToken_ThreeChars(c, c2, c3);
1520 if (token3 != OP) {
1521 token = token3;
1522 } else {
1523 tok_backup(tok, c3);
1524 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001525 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001526 *p_end = tok->cur;
1527 return token;
1528 }
1529 tok_backup(tok, c2);
1530 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001531
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001532 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001533 switch (c) {
1534 case '(':
1535 case '[':
1536 case '{':
1537 tok->level++;
1538 break;
1539 case ')':
1540 case ']':
1541 case '}':
1542 tok->level--;
1543 break;
1544 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001545
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001546 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001547 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001548 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001549 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001550}
1551
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001552int
1553PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1554{
1555 int result = tok_get(tok, p_start, p_end);
1556 if (tok->decoding_erred) {
1557 result = ERRORTOKEN;
1558 tok->done = E_DECODE;
1559 }
1560 return result;
1561}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001562
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building pgen or without Unicode support: there is
   nothing to restore, so always return NULL. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
/* Decode the first `len` bytes of `text` as UTF-8 and re-encode the
   result in encoding `enc`, using the "replace" error handler for both
   steps.  Returns a new reference to the encoded string object, or NULL
   on failure; in the failure case the pending exception is cleared. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}

/* Convert the first `len` bytes of the tokenizer buffer from UTF-8 back
   to the source file's original encoding (tok->encoding).

   Returns a NUL-terminated copy allocated with PyObject_MALLOC, or NULL
   if no encoding is recorded or the conversion or allocation fails.
   When the conversion succeeds and *offset is greater than 1, *offset
   (a 1-based position in the UTF-8 text) is rewritten to the matching
   position in the re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    PyObject *lineobj;
    const char *line;
    Py_ssize_t linelen;

    if (!tok->encoding)
        return NULL;

    /* convert source to original encoding */
    lineobj = dec_utf8(tok->encoding, tok->buf, len);
    if (lineobj == NULL)
        return NULL;

    linelen = PyString_Size(lineobj);
    line = PyString_AsString(lineobj);
    if (line == NULL || linelen < 0) {
        /* Never hand back an uninitialised buffer: report the failure
           as NULL and drop the exception, as dec_utf8() does. */
        PyErr_Clear();
    }
    else {
        text = PyObject_MALLOC(linelen + 1);
        if (text != NULL) {
            memcpy(text, line, linelen);
            text[linelen] = '\0';
        }
    }
    Py_DECREF(lineobj);

    /* adjust error offset */
    if (*offset > 1) {
        PyObject *offsetobj = dec_utf8(tok->encoding,
                                       tok->buf, *offset-1);
        if (offsetobj) {
            *offset = (int)PyString_Size(offsetobj) + 1;
            Py_DECREF(offsetobj);
        }
    }

    return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif
1623
Martin v. Löwisa5136192007-09-04 14:19:28 +00001624
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout.  For NAME,
   NUMBER, STRING and OP tokens the token text [start, end) follows in
   parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif