blob: 1d0a4aa3f23b438dc5c17185bbd973130582f436 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names.

   One printable name per token type.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
138 return fgets(s, size, tok->fp);
139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
144 return feof(tok->fp);
145}
146
/* PGEN build: strings are used as-is; STR is returned unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000160 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Normalize the common spellings of the UTF-8 and Latin-1 codec names.

   S is a NUL-terminated encoding name.  Only its first 12 characters
   are looked at; they are lower-cased and '_' is treated as '-'.

   Returns the literal "utf-8" or "iso-8859-1" when S names one of those
   codecs (optionally followed by "-<suffix>"), otherwise S itself.
   Callers detect a normalization by comparing the result with S; the
   literals must not be freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
201static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000202get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000204 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
212 }
213 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
223
224 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000225 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000226 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000227 t++;
228
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000233 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000234 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 }
236 return r;
237 }
238 }
239 }
240 return NULL;
241}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
248static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000249check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 int set_readline(struct tok_state *, const char *))
251{
Tim Peters17db21f2002-09-03 15:39:58 +0000252 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000254
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000258 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000267#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
272 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000273 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000274 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314#if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325#endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
329 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000330 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000331 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
338}
339
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored in S, followed by a NUL.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.

   An empty line from the reader is reported as NULL (EOF).  Errors go
   through error_ret(). */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over whatever was parked in the tokenizer state. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;      /* overflow from an earlier call */
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL : s;    /* empty means EOF */
#endif
}
404
405/* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
407
408 This function is called from check_bom and check_coding_spec.
409
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
412
413 Return 1 on success, 0 on failure. */
414
415static int
416fp_setreadl(struct tok_state *tok, const char* enc)
417{
418 PyObject *reader, *stream, *readline;
419
Martin v. Löwis95292d62002-12-11 14:04:59 +0000420 /* XXX: constify filename argument. */
421 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000422 if (stream == NULL)
423 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
425 reader = PyCodec_StreamReader(enc, stream, NULL);
426 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000427 if (reader == NULL)
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429
430 readline = PyObject_GetAttrString(reader, "readline");
431 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000432 if (readline == NULL)
433 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000434
435 tok->decoding_readline = readline;
436 return 1;
437}
438
439/* Fetch the next byte from TOK. */
440
441static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
443}
444
445/* Unfetch the last byte back into TOK. */
446
447static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
449}
450
451/* Read a line of input from TOK. Determine encoding
452 if necessary. */
453
454static char *
455decoding_fgets(char *s, int size, struct tok_state *tok)
456{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000457 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000458 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000459 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460 if (tok->decoding_state < 0) {
461 /* We already have a codec associated with
462 this input. */
463 line = fp_readl(s, size, tok);
464 break;
465 } else if (tok->decoding_state > 0) {
466 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000467 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469 break;
470 } else {
471 /* We have not yet determined the encoding.
472 If an encoding is found, use the file-pointer
473 reader functions from now on. */
474 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475 return error_ret(tok);
476 assert(tok->decoding_state != 0);
477 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000478 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481 return error_ret(tok);
482 }
483 }
484#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000485 /* The default encoding is ASCII, so make sure we don't have any
486 non-ASCII bytes in it. */
487 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000489 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000490 if (*c > 127) {
491 badchar = *c;
492 break;
493 }
494 }
495 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000496 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000499 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000503 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000504 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000505 PyErr_SetString(PyExc_SyntaxError, buf);
506 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 }
508#endif
509 return line;
510}
511
512static int
513decoding_feof(struct tok_state *tok)
514{
515 if (tok->decoding_state >= 0) {
516 return feof(tok->fp);
517 } else {
518 PyObject* buf = tok->decoding_buffer;
519 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000520 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521 if (buf == NULL) {
522 error_ret(tok);
523 return 1;
524 } else {
525 tok->decoding_buffer = buf;
526 }
527 }
528 return PyObject_Length(buf) == 0;
529 }
530}
531
532/* Fetch a byte from TOK, using the string buffer. */
533
Tim Petersc9d78aa2006-03-26 23:27:58 +0000534static int
535buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000536 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537}
538
539/* Unfetch a byte from TOK, using the string buffer. */
540
Tim Petersc9d78aa2006-03-26 23:27:58 +0000541static void
542buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
549
Tim Petersc9d78aa2006-03-26 23:27:58 +0000550static int
551buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 tok->enc = enc;
553 return 1;
554}
555
556/* Return a UTF-8 encoding Python string object from the
557 C byte string STR, which is encoded with ENC. */
558
Martin v. Löwis019934b2002-08-07 12:33:18 +0000559#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560static PyObject *
561translate_into_utf8(const char* str, const char* enc) {
562 PyObject *utf8;
563 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
564 if (buf == NULL)
565 return NULL;
566 utf8 = PyUnicode_AsUTF8String(buf);
567 Py_DECREF(buf);
568 return utf8;
569}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000570#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571
572/* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
575
576static const char *
577decode_str(const char *str, struct tok_state *tok)
578{
579 PyObject* utf8 = NULL;
580 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000581 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000586 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000588 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000589#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 str = PyString_AsString(utf8);
595 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000600 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000601 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 lineno++;
603 if (lineno == 2) break;
604 }
605 }
606 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000607 /* need to check line 1 and 2 separately since check_coding_spec
608 assumes a single line as input */
609 if (newl[0]) {
610 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611 return error_ret(tok);
612 if (tok->enc == NULL && newl[1]) {
613 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614 tok, buf_setreadl))
615 return error_ret(tok);
616 }
617 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000618#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 if (tok->enc != NULL) {
620 assert(utf8 == NULL);
621 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000622 if (utf8 == NULL) {
623 PyErr_Format(PyExc_SyntaxError,
624 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000625 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000626 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627 str = PyString_AsString(utf8);
628 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000629#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630 assert(tok->decoding_buffer == NULL);
631 tok->decoding_buffer = utf8; /* CAUTION */
632 return str;
633}
634
635#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636
637/* Set up tokenizer for string */
638
639struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000640PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000641{
642 struct tok_state *tok = tok_new();
643 if (tok == NULL)
644 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000646 if (str == NULL) {
647 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000649 }
650
Martin v. Löwis95292d62002-12-11 14:04:59 +0000651 /* XXX: constify members. */
652 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653 return tok;
654}
655
656
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000657/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658
659struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000660PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661{
662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000665 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 return NULL;
668 }
669 tok->cur = tok->inp = tok->buf;
670 tok->end = tok->buf + BUFSIZ;
671 tok->fp = fp;
672 tok->prompt = ps1;
673 tok->nextprompt = ps2;
674 return tok;
675}
676
677
678/* Free a tok_state structure */
679
680void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000681PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000684 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000685#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000686 Py_XDECREF(tok->decoding_readline);
687 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000688#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000689 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000690 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000691 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000692}
693
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000694#if !defined(PGEN) && defined(Py_USING_UNICODE)
695static int
696tok_stdin_decode(struct tok_state *tok, char **inp)
697{
698 PyObject *enc, *sysstdin, *decoded, *utf8;
699 const char *encoding;
700 char *converted;
701
702 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
703 return 0;
704 sysstdin = PySys_GetObject("stdin");
705 if (sysstdin == NULL || !PyFile_Check(sysstdin))
706 return 0;
707
708 enc = ((PyFileObject *)sysstdin)->f_encoding;
709 if (enc == NULL || !PyString_Check(enc))
710 return 0;
711 Py_INCREF(enc);
712
713 encoding = PyString_AsString(enc);
714 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
715 if (decoded == NULL)
716 goto error_clear;
717
718 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
719 Py_DECREF(decoded);
720 if (utf8 == NULL)
721 goto error_clear;
722
Neal Norwitz2aa9a5d2006-03-20 01:53:23 +0000723 assert(PyString_Check(utf8));
724 converted = new_string(PyString_AS_STRING(utf8),
725 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000726 Py_DECREF(utf8);
727 if (converted == NULL)
728 goto error_nomem;
729
Neal Norwitz08062d62006-04-11 08:19:15 +0000730 PyMem_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000731 *inp = converted;
732 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000733 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000734 tok->encoding = new_string(encoding, strlen(encoding));
735 if (tok->encoding == NULL)
736 goto error_nomem;
737
738 Py_DECREF(enc);
739 return 0;
740
741error_nomem:
742 Py_DECREF(enc);
743 tok->done = E_NOMEM;
744 return -1;
745
746error_clear:
747 /* Fallback to iso-8859-1: for backward compatibility */
748 Py_DECREF(enc);
749 PyErr_Clear();
750 return 0;
751}
752#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753
754/* Get next char, updating state; error code goes into tok->done */
755
756static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000757tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000760 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000761 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000762 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000763 if (tok->done != E_OK)
764 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000766 char *end = strchr(tok->inp, '\n');
767 if (end != NULL)
768 end++;
769 else {
770 end = strchr(tok->inp, '\0');
771 if (end == tok->inp) {
772 tok->done = E_EOF;
773 return EOF;
774 }
775 }
776 if (tok->start == NULL)
777 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000778 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000779 tok->lineno++;
780 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000781 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000782 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000784 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785 if (tok->nextprompt != NULL)
786 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000787 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000788 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000789 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000790 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 tok->done = E_EOF;
792 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000793#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000794 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000795 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000796#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000797 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000798 size_t start = tok->start - tok->buf;
799 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000800 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000801 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000802 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 tok->lineno++;
804 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000805 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000806 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000807 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000808 tok->done = E_NOMEM;
809 return EOF;
810 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 tok->buf = buf;
812 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000813 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000814 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000815 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 tok->inp = tok->buf + newlen;
817 tok->end = tok->inp + 1;
818 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000819 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000820 else {
821 tok->lineno++;
822 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000823 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000824 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000825 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000826 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000827 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000828 tok->inp = strchr(tok->buf, '\0');
829 tok->end = tok->inp + 1;
830 }
831 }
832 else {
833 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000834 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000835 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 if (tok->start == NULL) {
837 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000838 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000839 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 if (tok->buf == NULL) {
841 tok->done = E_NOMEM;
842 return EOF;
843 }
844 tok->end = tok->buf + BUFSIZ;
845 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000846 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
847 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 tok->done = E_EOF;
849 done = 1;
850 }
851 else {
852 tok->done = E_OK;
853 tok->inp = strchr(tok->buf, '\0');
854 done = tok->inp[-1] == '\n';
855 }
856 }
857 else {
858 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000859 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000860 tok->done = E_EOF;
861 done = 1;
862 }
863 else
864 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 }
866 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000868 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000869 Py_ssize_t curstart = tok->start == NULL ? -1 :
870 tok->start - tok->buf;
871 Py_ssize_t curvalid = tok->inp - tok->buf;
872 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000874 newbuf = (char *)PyMem_REALLOC(newbuf,
875 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 if (newbuf == NULL) {
877 tok->done = E_NOMEM;
878 tok->cur = tok->inp;
879 return EOF;
880 }
881 tok->buf = newbuf;
882 tok->inp = tok->buf + curvalid;
883 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000884 tok->start = curstart < 0 ? NULL :
885 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000886 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000888 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000889 /* Break out early on decoding
890 errors, as tok->buf will be NULL
891 */
892 if (tok->decoding_erred)
893 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000894 /* Last line does not end in \n,
895 fake one */
896 strcpy(tok->inp, "\n");
897 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000898 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000899 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000901 if (tok->buf != NULL) {
902 tok->cur = tok->buf + cur;
903 tok->line_start = tok->cur;
904 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000905 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000906 pt = tok->inp - 2;
907 if (pt >= tok->buf && *pt == '\r') {
908 *pt++ = '\n';
909 *pt = '\0';
910 tok->inp = pt;
911 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000912 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913 }
914 if (tok->done != E_OK) {
915 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000916 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000917 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918 return EOF;
919 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000920 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000921 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000922}
923
924
925/* Back-up one character */
926
927static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000928tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000929{
930 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000931 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000932 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000933 if (*tok->cur != c)
934 *tok->cur = c;
935 }
936}
937
938
939/* Return the token corresponding to a single character */
940
941int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000942PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000943{
944 switch (c) {
945 case '(': return LPAR;
946 case ')': return RPAR;
947 case '[': return LSQB;
948 case ']': return RSQB;
949 case ':': return COLON;
950 case ',': return COMMA;
951 case ';': return SEMI;
952 case '+': return PLUS;
953 case '-': return MINUS;
954 case '*': return STAR;
955 case '/': return SLASH;
956 case '|': return VBAR;
957 case '&': return AMPER;
958 case '<': return LESS;
959 case '>': return GREATER;
960 case '=': return EQUAL;
961 case '.': return DOT;
962 case '%': return PERCENT;
963 case '`': return BACKQUOTE;
964 case '{': return LBRACE;
965 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000966 case '^': return CIRCUMFLEX;
967 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000968 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000969 default: return OP;
970 }
971}
972
973
Guido van Rossumfbab9051991-10-20 20:25:03 +0000974int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000975PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000976{
977 switch (c1) {
978 case '=':
979 switch (c2) {
980 case '=': return EQEQUAL;
981 }
982 break;
983 case '!':
984 switch (c2) {
985 case '=': return NOTEQUAL;
986 }
987 break;
988 case '<':
989 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +0000990 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000991 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000992 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000993 }
994 break;
995 case '>':
996 switch (c2) {
997 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000998 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000999 }
1000 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001001 case '+':
1002 switch (c2) {
1003 case '=': return PLUSEQUAL;
1004 }
1005 break;
1006 case '-':
1007 switch (c2) {
1008 case '=': return MINEQUAL;
1009 }
1010 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001011 case '*':
1012 switch (c2) {
1013 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001014 case '=': return STAREQUAL;
1015 }
1016 break;
1017 case '/':
1018 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001019 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001020 case '=': return SLASHEQUAL;
1021 }
1022 break;
1023 case '|':
1024 switch (c2) {
1025 case '=': return VBAREQUAL;
1026 }
1027 break;
1028 case '%':
1029 switch (c2) {
1030 case '=': return PERCENTEQUAL;
1031 }
1032 break;
1033 case '&':
1034 switch (c2) {
1035 case '=': return AMPEREQUAL;
1036 }
1037 break;
1038 case '^':
1039 switch (c2) {
1040 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001041 }
1042 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001043 }
1044 return OP;
1045}
1046
Thomas Wouters434d0822000-08-24 20:11:32 +00001047int
1048PyToken_ThreeChars(int c1, int c2, int c3)
1049{
1050 switch (c1) {
1051 case '<':
1052 switch (c2) {
1053 case '<':
1054 switch (c3) {
1055 case '=':
1056 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001057 }
1058 break;
1059 }
1060 break;
1061 case '>':
1062 switch (c2) {
1063 case '>':
1064 switch (c3) {
1065 case '=':
1066 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001067 }
1068 break;
1069 }
1070 break;
1071 case '*':
1072 switch (c2) {
1073 case '*':
1074 switch (c3) {
1075 case '=':
1076 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 }
1078 break;
1079 }
1080 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001081 case '/':
1082 switch (c2) {
1083 case '/':
1084 switch (c3) {
1085 case '=':
1086 return DOUBLESLASHEQUAL;
1087 }
1088 break;
1089 }
1090 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001091 }
1092 return OP;
1093}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001094
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001096indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001097{
1098 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001099 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001100 tok->cur = tok->inp;
1101 return 1;
1102 }
1103 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001104 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1105 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001106 tok->altwarning = 0;
1107 }
1108 return 0;
1109}
1110
1111
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112/* Get next token, after space stripping etc. */
1113
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001114static int
1115tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116{
1117 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001118 int blankline;
1119
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001120 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001122 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001123 blankline = 0;
1124
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001125 /* Get indentation level */
1126 if (tok->atbol) {
1127 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001128 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001129 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001130 for (;;) {
1131 c = tok_nextc(tok);
1132 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133 col++, altcol++;
1134 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001136 altcol = (altcol/tok->alttabsize + 1)
1137 * tok->alttabsize;
1138 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001139 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001140 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141 else
1142 break;
1143 }
1144 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001145 if (c == '#' || c == '\n') {
1146 /* Lines with only whitespace and/or comments
1147 shouldn't affect the indentation and are
1148 not passed to the parser as NEWLINE tokens,
1149 except *totally* empty lines in interactive
1150 mode, which signal the end of a command group. */
1151 if (col == 0 && c == '\n' && tok->prompt != NULL)
1152 blankline = 0; /* Let it through */
1153 else
1154 blankline = 1; /* Ignore completely */
1155 /* We can't jump back right here since we still
1156 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001157 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001158 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001159 if (col == tok->indstack[tok->indent]) {
1160 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001161 if (altcol != tok->altindstack[tok->indent]) {
1162 if (indenterror(tok))
1163 return ERRORTOKEN;
1164 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001166 else if (col > tok->indstack[tok->indent]) {
1167 /* Indent -- always one */
1168 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001169 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001170 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001171 return ERRORTOKEN;
1172 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001173 if (altcol <= tok->altindstack[tok->indent]) {
1174 if (indenterror(tok))
1175 return ERRORTOKEN;
1176 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 tok->pendin++;
1178 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001179 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001181 else /* col < tok->indstack[tok->indent] */ {
1182 /* Dedent -- any number, must be consistent */
1183 while (tok->indent > 0 &&
1184 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001185 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001186 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001187 }
1188 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001189 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001190 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001191 return ERRORTOKEN;
1192 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001193 if (altcol != tok->altindstack[tok->indent]) {
1194 if (indenterror(tok))
1195 return ERRORTOKEN;
1196 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001197 }
1198 }
1199 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001200
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001201 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001202
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001203 /* Return pending indents/dedents */
1204 if (tok->pendin != 0) {
1205 if (tok->pendin < 0) {
1206 tok->pendin++;
1207 return DEDENT;
1208 }
1209 else {
1210 tok->pendin--;
1211 return INDENT;
1212 }
1213 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001214
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001216 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 /* Skip spaces */
1218 do {
1219 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001220 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001221
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001223 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001224
Guido van Rossumab5ca152000-03-31 00:52:27 +00001225 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001226 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001227 static char *tabforms[] = {
1228 "tab-width:", /* Emacs */
1229 ":tabstop=", /* vim, full form */
1230 ":ts=", /* vim, abbreviated form */
1231 "set tabsize=", /* will vi never die? */
1232 /* more templates can be added here to support other editors */
1233 };
1234 char cbuf[80];
1235 char *tp, **cp;
1236 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001238 *tp++ = c = tok_nextc(tok);
1239 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001240 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001241 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001242 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001243 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1244 cp++) {
1245 if ((tp = strstr(cbuf, *cp))) {
1246 int newsize = atoi(tp + strlen(*cp));
1247
1248 if (newsize >= 1 && newsize <= 40) {
1249 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001250 if (Py_VerboseFlag)
1251 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001252 "Tab size set to %d\n",
1253 newsize);
1254 }
1255 }
1256 }
1257 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001260
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001262 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001264 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001265
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 /* Identifier (most frequent token!) */
1267 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001268 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001269 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001270 case 'b':
1271 case 'B':
1272 c = tok_nextc(tok);
1273 if (c == 'r' || c == 'R')
1274 c = tok_nextc(tok);
1275 if (c == '"' || c == '\'')
1276 goto letter_quote;
1277 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001278 case 'r':
1279 case 'R':
1280 c = tok_nextc(tok);
1281 if (c == '"' || c == '\'')
1282 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001283 break;
1284 case 'u':
1285 case 'U':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001292 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001293 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001295 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001297 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 *p_end = tok->cur;
1299 return NAME;
1300 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001301
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 /* Newline */
1303 if (c == '\n') {
1304 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001305 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001306 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001307 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001309 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 return NEWLINE;
1311 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001312
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001313 /* Period or number starting with period? */
1314 if (c == '.') {
1315 c = tok_nextc(tok);
1316 if (isdigit(c)) {
1317 goto fraction;
1318 }
1319 else {
1320 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001321 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001322 *p_end = tok->cur;
1323 return DOT;
1324 }
1325 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001326
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 /* Number */
1328 if (isdigit(c)) {
1329 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001330 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001331 c = tok_nextc(tok);
1332 if (c == '.')
1333 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001334#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001335 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001336 goto imaginary;
1337#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001339
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001340 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001341 c = tok_nextc(tok);
1342 if (!isxdigit(c)) {
1343 tok->done = E_TOKEN;
1344 tok_backup(tok, c);
1345 return ERRORTOKEN;
1346 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 do {
1348 c = tok_nextc(tok);
1349 } while (isxdigit(c));
1350 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001351 else if (c == 'o' || c == 'O') {
1352 /* Octal */
1353 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001354 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001355 tok->done = E_TOKEN;
1356 tok_backup(tok, c);
1357 return ERRORTOKEN;
1358 }
1359 do {
1360 c = tok_nextc(tok);
1361 } while ('0' <= c && c < '8');
1362 }
1363 else if (c == 'b' || c == 'B') {
1364 /* Binary */
1365 c = tok_nextc(tok);
1366 if (c != '0' && c != '1') {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
1371 do {
1372 c = tok_nextc(tok);
1373 } while (c == '0' || c == '1');
1374 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001376 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001377 /* Octal; c is first char of it */
1378 /* There's no 'isoctdigit' macro, sigh */
1379 while ('0' <= c && c < '8') {
1380 c = tok_nextc(tok);
1381 }
Tim Petersd507dab2001-08-30 20:51:59 +00001382 if (isdigit(c)) {
1383 found_decimal = 1;
1384 do {
1385 c = tok_nextc(tok);
1386 } while (isdigit(c));
1387 }
1388 if (c == '.')
1389 goto fraction;
1390 else if (c == 'e' || c == 'E')
1391 goto exponent;
1392#ifndef WITHOUT_COMPLEX
1393 else if (c == 'j' || c == 'J')
1394 goto imaginary;
1395#endif
1396 else if (found_decimal) {
1397 tok->done = E_TOKEN;
1398 tok_backup(tok, c);
1399 return ERRORTOKEN;
1400 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001402 if (c == 'l' || c == 'L')
1403 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 }
1405 else {
1406 /* Decimal */
1407 do {
1408 c = tok_nextc(tok);
1409 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001410 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001412 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001413 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001414 if (c == '.') {
1415 fraction:
1416 /* Fraction */
1417 do {
1418 c = tok_nextc(tok);
1419 } while (isdigit(c));
1420 }
1421 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001422 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001423 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001425 if (c == '+' || c == '-')
1426 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001427 if (!isdigit(c)) {
1428 tok->done = E_TOKEN;
1429 tok_backup(tok, c);
1430 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001431 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001432 do {
1433 c = tok_nextc(tok);
1434 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001436#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001437 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001438 /* Imaginary part */
1439 imaginary:
1440 c = tok_nextc(tok);
1441#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001442 }
1443 }
1444 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001445 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001446 *p_end = tok->cur;
1447 return NUMBER;
1448 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001449
1450 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001451 /* String */
1452 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001453 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001454 int quote = c;
1455 int triple = 0;
1456 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001457 for (;;) {
1458 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001459 if (c == '\n') {
1460 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001461 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1464 }
1465 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001466 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001467 }
1468 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001469 if (triple)
1470 tok->done = E_EOFS;
1471 else
1472 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001473 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 return ERRORTOKEN;
1475 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001476 else if (c == quote) {
1477 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001478 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001479 c = tok_nextc(tok);
1480 if (c == quote) {
1481 triple = 1;
1482 tripcount = 0;
1483 continue;
1484 }
1485 tok_backup(tok, c);
1486 }
1487 if (!triple || tripcount == 3)
1488 break;
1489 }
1490 else if (c == '\\') {
1491 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001492 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001493 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001494 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001495 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001496 return ERRORTOKEN;
1497 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001498 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001499 else
1500 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001501 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001502 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001503 *p_end = tok->cur;
1504 return STRING;
1505 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001506
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001507 /* Line continuation */
1508 if (c == '\\') {
1509 c = tok_nextc(tok);
1510 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001511 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001512 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001513 return ERRORTOKEN;
1514 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001515 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516 goto again; /* Read next line */
1517 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001518
Guido van Rossumfbab9051991-10-20 20:25:03 +00001519 /* Check for two-character token */
1520 {
1521 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001522 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001523#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001524 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001525 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001526 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001527 tok->filename, tok->lineno,
1528 NULL, NULL)) {
1529 return ERRORTOKEN;
1530 }
1531 }
1532#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001533 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001534 int c3 = tok_nextc(tok);
1535 int token3 = PyToken_ThreeChars(c, c2, c3);
1536 if (token3 != OP) {
1537 token = token3;
1538 } else {
1539 tok_backup(tok, c3);
1540 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001541 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001542 *p_end = tok->cur;
1543 return token;
1544 }
1545 tok_backup(tok, c2);
1546 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001547
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001548 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001549 switch (c) {
1550 case '(':
1551 case '[':
1552 case '{':
1553 tok->level++;
1554 break;
1555 case ')':
1556 case ']':
1557 case '}':
1558 tok->level--;
1559 break;
1560 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001561
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001562 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001563 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001564 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001565 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001566}
1567
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001568int
1569PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1570{
1571 int result = tok_get(tok, p_start, p_end);
1572 if (tok->decoding_erred) {
1573 result = ERRORTOKEN;
1574 tok->done = E_DECODE;
1575 }
1576 return result;
1577}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001578
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
1582
Christian Heimes082c9b02008-01-23 14:20:50 +00001583#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building for PGEN or without Unicode support:
   there is nothing to convert, so no text is ever returned and
   *offset is left untouched. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    char *restored = NULL;

    (void)tok;
    (void)len;
    (void)offset;
    return restored;
}
1589#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001590#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001591static PyObject *
1592dec_utf8(const char *enc, const char *text, size_t len) {
1593 PyObject *ret = NULL;
1594 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1595 if (unicode_text) {
1596 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1597 Py_DECREF(unicode_text);
1598 }
1599 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001600 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001601 }
1602 return ret;
1603}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001604char *
1605PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1606{
1607 char *text = NULL;
1608 if (tok->encoding) {
1609 /* convert source to original encondig */
1610 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1611 if (lineobj != NULL) {
1612 int linelen = PyString_Size(lineobj);
1613 const char *line = PyString_AsString(lineobj);
1614 text = PyObject_MALLOC(linelen + 1);
1615 if (text != NULL && line != NULL) {
1616 if (linelen)
1617 strncpy(text, line, linelen);
1618 text[linelen] = '\0';
1619 }
1620 Py_DECREF(lineobj);
1621
1622 /* adjust error offset */
1623 if (*offset > 1) {
1624 PyObject *offsetobj = dec_utf8(tok->encoding,
1625 tok->buf, *offset-1);
1626 if (offsetobj) {
1627 *offset = PyString_Size(offsetobj) + 1;
1628 Py_DECREF(offsetobj);
1629 }
1630 }
1631
1632 }
1633 }
1634 return text;
1635
1636}
Georg Brandl76b30d12008-01-07 18:41:34 +00001637#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001638#endif
1639
Martin v. Löwisa5136192007-09-04 14:19:28 +00001640
Guido van Rossum408027e1996-12-30 16:17:54 +00001641#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001642
1643void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001644tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001645{
Guido van Rossum86bea461997-04-29 21:03:06 +00001646 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001647 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1648 printf("(%.*s)", (int)(end - start), start);
1649}
1650
1651#endif