blob: 0f6705de0b5559f57d13fce6b6a8d9fc10000655 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Printable names of the token types, in token-number order.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* Generic operator, error marker and table-size sentinel */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133#ifdef PGEN
134
135static char *
136decoding_fgets(char *s, int size, struct tok_state *tok)
137{
138 return fgets(s, size, tok->fp);
139}
140
141static int
142decoding_feof(struct tok_state *tok)
143{
144 return feof(tok->fp);
145}
146
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    /* PGEN build: the string is used as-is; TOK is not consulted. */
    (void)tok;
    return str;
}
152
153#else /* PGEN */
154
155static char *
156error_ret(struct tok_state *tok) /* XXX */
157{
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000160 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
163}
164
165static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000166new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167{
Neal Norwitz08062d62006-04-11 08:19:15 +0000168 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Map the common spellings of the utf-8 and latin-1 encoding names to a
   canonical form.

   At most the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'.  Returns the string constant "utf-8" or
   "iso-8859-1" when S is one of the recognised spellings (optionally
   followed by a "-suffix"); otherwise returns S itself, unchanged.
   Callers can therefore compare the result with S to learn whether a
   canonical constant was substituted. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a byte >= 0x80 where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
201static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000202get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000204 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
212 }
213 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
223
224 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000225 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000226 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000227 t++;
228
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000233 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000234 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 }
236 return r;
237 }
238 }
239 }
240 return NULL;
241}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
248static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000249check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 int set_readline(struct tok_state *, const char *))
251{
Tim Peters17db21f2002-09-03 15:39:58 +0000252 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000254
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000258 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000267#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
272 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000273 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000274 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314#if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325#endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
329 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000330 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000331 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
338}
339
/* Read a line of text from TOK into S, using the stream in TOK.
   S has room for SIZE bytes; one of them is reserved for the
   terminating NUL.  Return NULL on failure or at EOF, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
        room (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over whatever was stored; an exact string is already
           UTF-8 overflow from an earlier call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Too long for S: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL : s;    /* empty line means EOF */
#endif
}
404
405/* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
407
408 This function is called from check_bom and check_coding_spec.
409
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
412
413 Return 1 on success, 0 on failure. */
414
415static int
416fp_setreadl(struct tok_state *tok, const char* enc)
417{
418 PyObject *reader, *stream, *readline;
419
Martin v. Löwis95292d62002-12-11 14:04:59 +0000420 /* XXX: constify filename argument. */
421 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000422 if (stream == NULL)
423 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
425 reader = PyCodec_StreamReader(enc, stream, NULL);
426 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000427 if (reader == NULL)
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429
430 readline = PyObject_GetAttrString(reader, "readline");
431 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000432 if (readline == NULL)
433 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000434
435 tok->decoding_readline = readline;
436 return 1;
437}
438
439/* Fetch the next byte from TOK. */
440
441static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
443}
444
445/* Unfetch the last byte back into TOK. */
446
447static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
449}
450
451/* Read a line of input from TOK. Determine encoding
452 if necessary. */
453
454static char *
455decoding_fgets(char *s, int size, struct tok_state *tok)
456{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000457 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000458 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000459 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460 if (tok->decoding_state < 0) {
461 /* We already have a codec associated with
462 this input. */
463 line = fp_readl(s, size, tok);
464 break;
465 } else if (tok->decoding_state > 0) {
466 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000467 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469 break;
470 } else {
471 /* We have not yet determined the encoding.
472 If an encoding is found, use the file-pointer
473 reader functions from now on. */
474 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475 return error_ret(tok);
476 assert(tok->decoding_state != 0);
477 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000478 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481 return error_ret(tok);
482 }
483 }
484#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000485 /* The default encoding is ASCII, so make sure we don't have any
486 non-ASCII bytes in it. */
487 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000489 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000490 if (*c > 127) {
491 badchar = *c;
492 break;
493 }
494 }
495 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000496 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000499 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000503 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000504 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000505 PyErr_SetString(PyExc_SyntaxError, buf);
506 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 }
508#endif
509 return line;
510}
511
512static int
513decoding_feof(struct tok_state *tok)
514{
515 if (tok->decoding_state >= 0) {
516 return feof(tok->fp);
517 } else {
518 PyObject* buf = tok->decoding_buffer;
519 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000520 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521 if (buf == NULL) {
522 error_ret(tok);
523 return 1;
524 } else {
525 tok->decoding_buffer = buf;
526 }
527 }
528 return PyObject_Length(buf) == 0;
529 }
530}
531
532/* Fetch a byte from TOK, using the string buffer. */
533
Tim Petersc9d78aa2006-03-26 23:27:58 +0000534static int
535buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000536 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537}
538
539/* Unfetch a byte from TOK, using the string buffer. */
540
Tim Petersc9d78aa2006-03-26 23:27:58 +0000541static void
542buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
549
Tim Petersc9d78aa2006-03-26 23:27:58 +0000550static int
551buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 tok->enc = enc;
553 return 1;
554}
555
/* Return a new Python string object holding the UTF-8 encoding of the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded;
    PyObject *decoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (!decoded)
        return NULL;

    /* The intermediate unicode object is not needed once encoded. */
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571
572/* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
575
576static const char *
577decode_str(const char *str, struct tok_state *tok)
578{
579 PyObject* utf8 = NULL;
580 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000581 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000586 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000588 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000589#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000594 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000600 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000601 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 lineno++;
603 if (lineno == 2) break;
604 }
605 }
606 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000607 /* need to check line 1 and 2 separately since check_coding_spec
608 assumes a single line as input */
609 if (newl[0]) {
610 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611 return error_ret(tok);
612 if (tok->enc == NULL && newl[1]) {
613 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614 tok, buf_setreadl))
615 return error_ret(tok);
616 }
617 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000618#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 if (tok->enc != NULL) {
620 assert(utf8 == NULL);
621 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000622 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000623 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000624 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000626#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627 assert(tok->decoding_buffer == NULL);
628 tok->decoding_buffer = utf8; /* CAUTION */
629 return str;
630}
631
632#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000633
634/* Set up tokenizer for string */
635
636struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000637PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000638{
639 struct tok_state *tok = tok_new();
640 if (tok == NULL)
641 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000643 if (str == NULL) {
644 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000646 }
647
Martin v. Löwis95292d62002-12-11 14:04:59 +0000648 /* XXX: constify members. */
649 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000650 return tok;
651}
652
653
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000654/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655
656struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000657PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658{
659 struct tok_state *tok = tok_new();
660 if (tok == NULL)
661 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000662 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000663 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000664 return NULL;
665 }
666 tok->cur = tok->inp = tok->buf;
667 tok->end = tok->buf + BUFSIZ;
668 tok->fp = fp;
669 tok->prompt = ps1;
670 tok->nextprompt = ps2;
671 return tok;
672}
673
674
675/* Free a tok_state structure */
676
677void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000678PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000679{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000680 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000681 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000682#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683 Py_XDECREF(tok->decoding_readline);
684 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000685#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000687 PyMem_FREE(tok->buf);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000688 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000689}
690
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP, just read from stdin, as UTF-8.

   Applies only when sys.stdin is a file object wrapping the C stdin
   and has a string f_encoding; otherwise *INP is left untouched.  On
   success *INP is freed and replaced by a new string from new_string(),
   and tok->encoding is set to a copy of the stdin encoding name.

   Returns 0 on success and also when nothing was converted (including
   decode/encode errors, which are cleared).  Returns -1 with tok->done
   set to E_NOMEM when memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode, *utf8;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold our own reference to the encoding while it is in use. */
    Py_INCREF(enc_obj);

    enc_name = PyString_AsString(enc_obj);
    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (!unicode)
        goto conversion_failed;

    utf8 = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (!utf8)
        goto conversion_failed;

    assert(PyString_Check(utf8));
    recoded = new_string(PyString_AS_STRING(utf8),
                         PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (!recoded)
        goto out_of_memory;

    /* Swap the caller's line for the UTF-8 version. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (!tok->encoding)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

conversion_failed:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750
751/* Get next char, updating state; error code goes into tok->done */
752
753static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000754tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000757 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000758 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000759 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000760 if (tok->done != E_OK)
761 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000763 char *end = strchr(tok->inp, '\n');
764 if (end != NULL)
765 end++;
766 else {
767 end = strchr(tok->inp, '\0');
768 if (end == tok->inp) {
769 tok->done = E_EOF;
770 return EOF;
771 }
772 }
773 if (tok->start == NULL)
774 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000775 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000776 tok->lineno++;
777 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000778 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000781 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000782 if (tok->nextprompt != NULL)
783 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000784 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000785 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000786 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000787 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788 tok->done = E_EOF;
789 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000790#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000791 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000792 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000793#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000795 size_t start = tok->start - tok->buf;
796 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000797 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000798 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000799 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000800 tok->lineno++;
801 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000802 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000803 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000804 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000805 tok->done = E_NOMEM;
806 return EOF;
807 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 tok->buf = buf;
809 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000810 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000811 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000812 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000813 tok->inp = tok->buf + newlen;
814 tok->end = tok->inp + 1;
815 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000816 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 else {
818 tok->lineno++;
819 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000820 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000821 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000822 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000824 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000825 tok->inp = strchr(tok->buf, '\0');
826 tok->end = tok->inp + 1;
827 }
828 }
829 else {
830 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000831 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000832 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000833 if (tok->start == NULL) {
834 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000835 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000836 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000837 if (tok->buf == NULL) {
838 tok->done = E_NOMEM;
839 return EOF;
840 }
841 tok->end = tok->buf + BUFSIZ;
842 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000843 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
844 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000845 tok->done = E_EOF;
846 done = 1;
847 }
848 else {
849 tok->done = E_OK;
850 tok->inp = strchr(tok->buf, '\0');
851 done = tok->inp[-1] == '\n';
852 }
853 }
854 else {
855 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000856 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000857 tok->done = E_EOF;
858 done = 1;
859 }
860 else
861 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 }
863 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000864 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t curstart = tok->start == NULL ? -1 :
867 tok->start - tok->buf;
868 Py_ssize_t curvalid = tok->inp - tok->buf;
869 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000870 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000871 newbuf = (char *)PyMem_REALLOC(newbuf,
872 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 if (newbuf == NULL) {
874 tok->done = E_NOMEM;
875 tok->cur = tok->inp;
876 return EOF;
877 }
878 tok->buf = newbuf;
879 tok->inp = tok->buf + curvalid;
880 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000881 tok->start = curstart < 0 ? NULL :
882 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000883 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000884 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000885 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000886 /* Break out early on decoding
887 errors, as tok->buf will be NULL
888 */
889 if (tok->decoding_erred)
890 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000891 /* Last line does not end in \n,
892 fake one */
893 strcpy(tok->inp, "\n");
894 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000896 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000897 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000898 if (tok->buf != NULL) {
899 tok->cur = tok->buf + cur;
900 tok->line_start = tok->cur;
901 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000902 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000903 pt = tok->inp - 2;
904 if (pt >= tok->buf && *pt == '\r') {
905 *pt++ = '\n';
906 *pt = '\0';
907 tok->inp = pt;
908 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000909 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910 }
911 if (tok->done != E_OK) {
912 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000913 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000914 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000915 return EOF;
916 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000917 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000918 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919}
920
921
922/* Back-up one character */
923
924static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000925tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000926{
927 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000928 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000929 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000930 if (*tok->cur != c)
931 *tok->cur = c;
932 }
933}
934
935
936/* Return the token corresponding to a single character */
937
938int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000939PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000940{
941 switch (c) {
942 case '(': return LPAR;
943 case ')': return RPAR;
944 case '[': return LSQB;
945 case ']': return RSQB;
946 case ':': return COLON;
947 case ',': return COMMA;
948 case ';': return SEMI;
949 case '+': return PLUS;
950 case '-': return MINUS;
951 case '*': return STAR;
952 case '/': return SLASH;
953 case '|': return VBAR;
954 case '&': return AMPER;
955 case '<': return LESS;
956 case '>': return GREATER;
957 case '=': return EQUAL;
958 case '.': return DOT;
959 case '%': return PERCENT;
960 case '`': return BACKQUOTE;
961 case '{': return LBRACE;
962 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000963 case '^': return CIRCUMFLEX;
964 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000965 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000966 default: return OP;
967 }
968}
969
970
Guido van Rossumfbab9051991-10-20 20:25:03 +0000971int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000972PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000973{
974 switch (c1) {
975 case '=':
976 switch (c2) {
977 case '=': return EQEQUAL;
978 }
979 break;
980 case '!':
981 switch (c2) {
982 case '=': return NOTEQUAL;
983 }
984 break;
985 case '<':
986 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +0000987 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000988 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000989 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000990 }
991 break;
992 case '>':
993 switch (c2) {
994 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000995 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000996 }
997 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000998 case '+':
999 switch (c2) {
1000 case '=': return PLUSEQUAL;
1001 }
1002 break;
1003 case '-':
1004 switch (c2) {
1005 case '=': return MINEQUAL;
1006 }
1007 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001008 case '*':
1009 switch (c2) {
1010 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001011 case '=': return STAREQUAL;
1012 }
1013 break;
1014 case '/':
1015 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001016 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001017 case '=': return SLASHEQUAL;
1018 }
1019 break;
1020 case '|':
1021 switch (c2) {
1022 case '=': return VBAREQUAL;
1023 }
1024 break;
1025 case '%':
1026 switch (c2) {
1027 case '=': return PERCENTEQUAL;
1028 }
1029 break;
1030 case '&':
1031 switch (c2) {
1032 case '=': return AMPEREQUAL;
1033 }
1034 break;
1035 case '^':
1036 switch (c2) {
1037 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001038 }
1039 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001040 }
1041 return OP;
1042}
1043
Thomas Wouters434d0822000-08-24 20:11:32 +00001044int
1045PyToken_ThreeChars(int c1, int c2, int c3)
1046{
1047 switch (c1) {
1048 case '<':
1049 switch (c2) {
1050 case '<':
1051 switch (c3) {
1052 case '=':
1053 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001054 }
1055 break;
1056 }
1057 break;
1058 case '>':
1059 switch (c2) {
1060 case '>':
1061 switch (c3) {
1062 case '=':
1063 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001064 }
1065 break;
1066 }
1067 break;
1068 case '*':
1069 switch (c2) {
1070 case '*':
1071 switch (c3) {
1072 case '=':
1073 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001074 }
1075 break;
1076 }
1077 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001078 case '/':
1079 switch (c2) {
1080 case '/':
1081 switch (c3) {
1082 case '=':
1083 return DOUBLESLASHEQUAL;
1084 }
1085 break;
1086 }
1087 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001088 }
1089 return OP;
1090}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001091
Guido van Rossum926f13a1998-04-09 21:38:06 +00001092static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001093indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001094{
1095 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001096 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001097 tok->cur = tok->inp;
1098 return 1;
1099 }
1100 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001101 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1102 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001103 tok->altwarning = 0;
1104 }
1105 return 0;
1106}
1107
1108
/* Get next token, after space stripping etc.

   Scans one token from tok's input and returns its type.  For tokens that
   carry text (NAME, NUMBER, STRING, NEWLINE, operators and punctuation)
   *p_start and *p_end are set to delimit that text inside tok's buffer;
   for INDENT, DEDENT, ENDMARKER and ERRORTOKEN they are left NULL.

   On most error paths tok->done is set to an E_* code before ERRORTOKEN
   is returned.

   State kept in tok across calls and used here:
     atbol     - non-zero when the next character starts a new line
     level     - bracket nesting depth; newlines and indentation are
                 ignored while it is greater than zero
     pendin    - INDENT (positive) or DEDENT (negative) tokens that still
                 have to be handed out
     indstack / altindstack - indentation columns computed with
                 tok->tabsize and tok->alttabsize respectively; a
                 disagreement between the two is passed to indenterror()
     cont_line - set while inside a backslash continuation or a
                 multi-line string, cleared when NEWLINE is returned */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        /* Measure the leading whitespace twice: col with tok->tabsize,
           altcol with tok->alttabsize. */
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation only matters outside brackets. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* Only one is handed out per call; the remainder stays in pendin. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",               /* Emacs */
            ":tabstop=",                /* vim, full form */
            ":ts=",                     /* vim, abbreviated form */
            "set tabsize=",             /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy at most sizeof(cbuf)-1 characters of the comment into
           cbuf; the loop condition leaves room for the terminator. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                /* Values outside 1..40 are silently ignored. */
                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Consume whatever is left of a comment longer than cbuf. */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process string prefixes: b"", br"", r"", u"" and ur"".
           When a quote follows the prefix, continue at letter_quote
           with tok->start still pointing at the prefix. */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Blank/comment-only lines and newlines inside brackets do not
           produce a NEWLINE token. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                /* At least one hex digit must follow "0x". */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                /* At least one octal digit must follow "0o". */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                /* At least one binary digit must follow "0b". */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                /* Digits 8 or 9 after a leading zero are only
                   acceptable if the literal turns out to be a float
                   or an imaginary number (checked below). */
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long-integer suffix */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                /* The fraction, exponent and imaginary labels below are
                   also jumped to from the "." and leading-zero code
                   above. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the value of tok->cur - tok->start right after
           reading the character that directly follows the opening
           quote; it is used to recognise the start of a triple-quoted
           string. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        /* Number of consecutive quote characters seen so far */
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                /* Second quote directly after the opening one: a
                   third quote starts a triple-quoted string,
                   anything else means the string is empty. */
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the escaped character, whatever it is. */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        /* NOTE(review): this error path returns ERRORTOKEN without
           assigning tok->done, unlike the other error returns in this
           function. */
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* Try to extend the match to a three-character token. */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    /* NOTE(review): nothing here stops tok->level from going negative
       on an unmatched closing bracket. */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1564
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001565int
1566PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1567{
1568 int result = tok_get(tok, p_start, p_end);
1569 if (tok->decoding_erred) {
1570 result = ERRORTOKEN;
1571 tok->done = E_DECODE;
1572 }
1573 return result;
1574}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001575
Martin v. Löwisa5136192007-09-04 14:19:28 +00001576/* This function is only called from parsetok. However, it cannot live
1577 there, as it must be empty for PGEN, and we can check for PGEN only
1578 in this file. */
1579
Christian Heimes082c9b02008-01-23 14:20:50 +00001580#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant for PGEN builds and builds without Unicode support: nothing is
   re-encoded, so all arguments are ignored and NULL is returned. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1586#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001587#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001588static PyObject *
1589dec_utf8(const char *enc, const char *text, size_t len) {
1590 PyObject *ret = NULL;
1591 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1592 if (unicode_text) {
1593 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1594 Py_DECREF(unicode_text);
1595 }
1596 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001597 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001598 }
1599 return ret;
1600}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001601char *
1602PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1603{
1604 char *text = NULL;
1605 if (tok->encoding) {
1606 /* convert source to original encondig */
1607 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1608 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001609 int linelen = PyString_Size(lineobj);
1610 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001611 text = PyObject_MALLOC(linelen + 1);
1612 if (text != NULL && line != NULL) {
1613 if (linelen)
1614 strncpy(text, line, linelen);
1615 text[linelen] = '\0';
1616 }
1617 Py_DECREF(lineobj);
1618
1619 /* adjust error offset */
1620 if (*offset > 1) {
1621 PyObject *offsetobj = dec_utf8(tok->encoding,
1622 tok->buf, *offset-1);
1623 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001624 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001625 Py_DECREF(offsetobj);
1626 }
1627 }
1628
1629 }
1630 }
1631 return text;
1632
1633}
Georg Brandl76b30d12008-01-07 18:41:34 +00001634#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001635#endif
1636
Martin v. Löwisa5136192007-09-04 14:19:28 +00001637
Guido van Rossum408027e1996-12-30 16:17:54 +00001638#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001639
1640void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001641tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001642{
Guido van Rossum86bea461997-04-29 21:03:06 +00001643 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001644 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1645 printf("(%.*s)", (int)(end - start), start);
1646}
1647
1648#endif