
/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
/* PyOS_Readline() returns a malloc'ed string including the trailing \n,
   an empty malloc'ed string for EOF, or NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file. */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000098tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000130 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131}
132
Benjamin Petersone36199b2009-11-12 23:39:44 +0000133static char *
134new_string(const char *s, Py_ssize_t len)
135{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000142}
143
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144#ifdef PGEN
145
146static char *
147decoding_fgets(char *s, int size, struct tok_state *tok)
148{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000150}
151
152static int
153decoding_feof(struct tok_state *tok)
154{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000155 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000156}
157
/* PGEN build: no decoding is done; return a PyMem-allocated copy of STR
   (NULL if out of memory).  EXEC_INPUT and TOK are not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
163
164#else /* PGEN */
165
166static char *
167error_ret(struct tok_state *tok) /* XXX */
168{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171 PyMem_FREE(tok->buf);
172 tok->buf = NULL;
173 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176
/* Map the various spellings of UTF-8 and Latin-1 codec names onto one
   canonical name each.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (or starts with the name of) one of those
   codecs, and S itself otherwise, so a caller can detect "no change" by
   comparing the result pointer against its argument. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative value other than
           EOF to tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000211 Py_ssize_t i;
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000230
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000231 begin = t;
232 while (Py_ISALNUM(t[0]) ||
233 t[0] == '-' || t[0] == '_' || t[0] == '.')
234 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
240 PyMem_FREE(r);
241 r = new_string(q, strlen(q));
242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000257 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000259 char * cs;
260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300262 if (tok->cont_line) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000263 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300264 tok->read_coding_spec = 1;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000265 return 1;
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300266 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000267 cs = get_coding_spec(line, size);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300268 if (!cs) {
269 Py_ssize_t i;
270 for (i = 0; i < size; i++) {
271 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
272 break;
273 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
274 /* Stop checking coding spec after a line containing
275 * anything except a comment. */
276 tok->read_coding_spec = 1;
277 break;
278 }
279 }
280 } else {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000281 tok->read_coding_spec = 1;
282 if (tok->encoding == NULL) {
283 assert(tok->decoding_state == 1); /* raw */
284 if (strcmp(cs, "utf-8") == 0 ||
285 strcmp(cs, "iso-8859-1") == 0) {
286 tok->encoding = cs;
287 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = -1;
293 }
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300294 else {
295 PyErr_Format(PyExc_SyntaxError,
296 "encoding problem: %s", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000297 PyMem_FREE(cs);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300298 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000299#else
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000300 /* Without Unicode support, we cannot
301 process the coding spec. Since there
302 won't be any Unicode literals, that
303 won't matter. */
304 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000305#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000306 }
307 } else { /* then, compare cs with BOM */
308 r = (strcmp(tok->encoding, cs) == 0);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300309 if (!r)
310 PyErr_Format(PyExc_SyntaxError,
311 "encoding problem: %s with BOM", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000312 PyMem_FREE(cs);
313 }
314 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000315 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000316}
317
318/* See whether the file starts with a BOM. If it does,
319 invoke the set_readline function with the new encoding.
320 Return 1 on success, 0 on failure. */
321
322static int
323check_bom(int get_char(struct tok_state *),
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000324 void unget_char(int, struct tok_state *),
325 int set_readline(struct tok_state *, const char *),
326 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000327{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000328 int ch1, ch2, ch3;
329 ch1 = get_char(tok);
330 tok->decoding_state = 1;
331 if (ch1 == EOF) {
332 return 1;
333 } else if (ch1 == 0xEF) {
334 ch2 = get_char(tok);
335 if (ch2 != 0xBB) {
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
340 ch3 = get_char(tok);
341 if (ch3 != 0xBF) {
342 unget_char(ch3, tok);
343 unget_char(ch2, tok);
344 unget_char(ch1, tok);
345 return 1;
346 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000347#if 0
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000348 /* Disable support for UTF-16 BOMs until a decision
349 is made whether this needs to be supported. */
350 } else if (ch1 == 0xFE) {
351 ch2 = get_char(tok);
352 if (ch2 != 0xFF) {
353 unget_char(ch2, tok);
354 unget_char(ch1, tok);
355 return 1;
356 }
357 if (!set_readline(tok, "utf-16-be"))
358 return 0;
359 tok->decoding_state = -1;
360 } else if (ch1 == 0xFF) {
361 ch2 = get_char(tok);
362 if (ch2 != 0xFE) {
363 unget_char(ch2, tok);
364 unget_char(ch1, tok);
365 return 1;
366 }
367 if (!set_readline(tok, "utf-16-le"))
368 return 0;
369 tok->decoding_state = -1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000371 } else {
372 unget_char(ch1, tok);
373 return 1;
374 }
375 if (tok->encoding != NULL)
376 PyMem_FREE(tok->encoding);
377 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
378 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000379}
380
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a NUL.
   Return NULL on failure or end of input, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
        room (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the reference held by tok->decoding_buffer. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;          /* leftover UTF-8 from last call */
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(pending)) {
            Py_DECREF(pending);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
452
453/* Set the readline function for TOK to a StreamReader's
454 readline function. The StreamReader is named ENC.
455
456 This function is called from check_bom and check_coding_spec.
457
458 ENC is usually identical to the future value of tok->encoding,
459 except for the (currently unsupported) case of UTF-16.
460
461 Return 1 on success, 0 on failure. */
462
463static int
464fp_setreadl(struct tok_state *tok, const char* enc)
465{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000466 PyObject *reader, *stream, *readline;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000468 /* XXX: constify filename argument. */
469 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
470 if (stream == NULL)
471 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000473 reader = PyCodec_StreamReader(enc, stream, NULL);
474 Py_DECREF(stream);
475 if (reader == NULL)
476 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000478 readline = PyObject_GetAttrString(reader, "readline");
479 Py_DECREF(reader);
480 if (readline == NULL)
481 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000483 tok->decoding_readline = readline;
484 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485}
486
487/* Fetch the next byte from TOK. */
488
489static int fp_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000490 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000491}
492
493/* Unfetch the last byte back into TOK. */
494
495static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000496 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000497}
498
499/* Read a line of input from TOK. Determine encoding
500 if necessary. */
501
502static char *
503decoding_fgets(char *s, int size, struct tok_state *tok)
504{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000505 char *line = NULL;
506 int badchar = 0;
507 for (;;) {
508 if (tok->decoding_state < 0) {
509 /* We already have a codec associated with
510 this input. */
511 line = fp_readl(s, size, tok);
512 break;
513 } else if (tok->decoding_state > 0) {
514 /* We want a 'raw' read. */
515 line = Py_UniversalNewlineFgets(s, size,
516 tok->fp, NULL);
517 break;
518 } else {
519 /* We have not yet determined the encoding.
520 If an encoding is found, use the file-pointer
521 reader functions from now on. */
522 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
523 return error_ret(tok);
524 assert(tok->decoding_state != 0);
525 }
526 }
527 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
528 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
529 return error_ret(tok);
530 }
531 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000533 /* The default encoding is ASCII, so make sure we don't have any
534 non-ASCII bytes in it. */
535 if (line && !tok->encoding) {
536 unsigned char *c;
537 for (c = (unsigned char *)line; *c; c++)
538 if (*c > 127) {
539 badchar = *c;
540 break;
541 }
542 }
543 if (badchar) {
544 char buf[500];
545 /* Need to add 1 to the line number, since this line
546 has not been counted, yet. */
547 sprintf(buf,
548 "Non-ASCII character '\\x%.2x' "
549 "in file %.200s on line %i, "
550 "but no encoding declared; "
Ned Deily24b82092014-06-17 12:24:53 -0700551 "see http://python.org/dev/peps/pep-0263/ for details",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000552 badchar, tok->filename, tok->lineno + 1);
553 PyErr_SetString(PyExc_SyntaxError, buf);
554 return error_ret(tok);
555 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000557 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558}
559
560static int
561decoding_feof(struct tok_state *tok)
562{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000563 if (tok->decoding_state >= 0) {
564 return feof(tok->fp);
565 } else {
566 PyObject* buf = tok->decoding_buffer;
567 if (buf == NULL) {
568 buf = PyObject_CallObject(tok->decoding_readline, NULL);
569 if (buf == NULL) {
570 error_ret(tok);
571 return 1;
572 } else {
573 tok->decoding_buffer = buf;
574 }
575 }
576 return PyObject_Length(buf) == 0;
577 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578}
579
580/* Fetch a byte from TOK, using the string buffer. */
581
Tim Petersc9d78aa2006-03-26 23:27:58 +0000582static int
583buf_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000584 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585}
586
587/* Unfetch a byte from TOK, using the string buffer. */
588
Tim Petersc9d78aa2006-03-26 23:27:58 +0000589static void
590buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000591 tok->str--;
592 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593}
594
595/* Set the readline function for TOK to ENC. For the string-based
596 tokenizer, this means to just record the encoding. */
597
Tim Petersc9d78aa2006-03-26 23:27:58 +0000598static int
599buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000600 tok->enc = enc;
601 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602}
603
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619
Benjamin Petersone36199b2009-11-12 23:39:44 +0000620
621static char *
622translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000623 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
624 char *buf, *current;
625 char c = '\0';
626 buf = PyMem_MALLOC(needed_length);
627 if (buf == NULL) {
628 tok->done = E_NOMEM;
629 return NULL;
630 }
631 for (current = buf; *s; s++, current++) {
632 c = *s;
633 if (skip_next_lf) {
634 skip_next_lf = 0;
635 if (c == '\n') {
636 c = *++s;
637 if (!c)
638 break;
639 }
640 }
641 if (c == '\r') {
642 skip_next_lf = 1;
643 c = '\n';
644 }
645 *current = c;
646 }
647 /* If this is exec input, add a newline to the end of the string if
648 there isn't one already. */
649 if (exec_input && c != '\n') {
650 *current = '\n';
651 current++;
652 }
653 *current = '\0';
654 final_length = current - buf + 1;
655 if (final_length < needed_length && final_length)
656 /* should never fail */
657 buf = PyMem_REALLOC(buf, final_length);
658 return buf;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000659}
660
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661/* Decode a byte string STR for use as the buffer of TOK.
662 Look for encoding declarations inside STR, and record them
663 inside TOK. */
664
665static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000666decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000668 PyObject* utf8 = NULL;
669 const char *str;
670 const char *s;
671 const char *newl[2] = {NULL, NULL};
672 int lineno = 0;
673 tok->input = str = translate_newlines(input, single, tok);
674 if (str == NULL)
675 return NULL;
676 tok->enc = NULL;
677 tok->str = str;
678 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
679 return error_ret(tok);
680 str = tok->str; /* string after BOM if any */
681 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000682#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000683 if (tok->enc != NULL) {
684 utf8 = translate_into_utf8(str, tok->enc);
685 if (utf8 == NULL)
686 return error_ret(tok);
687 str = PyString_AsString(utf8);
688 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000689#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000690 for (s = str;; s++) {
691 if (*s == '\0') break;
692 else if (*s == '\n') {
693 assert(lineno < 2);
694 newl[lineno] = s;
695 lineno++;
696 if (lineno == 2) break;
697 }
698 }
699 tok->enc = NULL;
700 /* need to check line 1 and 2 separately since check_coding_spec
701 assumes a single line as input */
702 if (newl[0]) {
703 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704 return error_ret(tok);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300705 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000706 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707 tok, buf_setreadl))
708 return error_ret(tok);
709 }
710 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000711#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000712 if (tok->enc != NULL) {
713 assert(utf8 == NULL);
714 utf8 = translate_into_utf8(str, tok->enc);
715 if (utf8 == NULL)
716 return error_ret(tok);
717 str = PyString_AsString(utf8);
718 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000719#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000720 assert(tok->decoding_buffer == NULL);
721 tok->decoding_buffer = utf8; /* CAUTION */
722 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000723}
724
725#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726
727/* Set up tokenizer for string */
728
729struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000730PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000732 struct tok_state *tok = tok_new();
733 if (tok == NULL)
734 return NULL;
735 str = (char *)decode_str(str, exec_input, tok);
736 if (str == NULL) {
737 PyTokenizer_Free(tok);
738 return NULL;
739 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000740
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000741 /* XXX: constify members. */
742 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
743 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744}
745
746
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000747/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748
749struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000750PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000752 struct tok_state *tok = tok_new();
753 if (tok == NULL)
754 return NULL;
755 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
756 PyTokenizer_Free(tok);
757 return NULL;
758 }
759 tok->cur = tok->inp = tok->buf;
760 tok->end = tok->buf + BUFSIZ;
761 tok->fp = fp;
762 tok->prompt = ps1;
763 tok->nextprompt = ps2;
764 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765}
766
767
768/* Free a tok_state structure */
769
770void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000771PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000773 if (tok->encoding != NULL)
774 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000775#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000776 Py_XDECREF(tok->decoding_readline);
777 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000778#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000779 if (tok->fp != NULL && tok->buf != NULL)
780 PyMem_FREE(tok->buf);
781 if (tok->input)
782 PyMem_FREE((char *)tok->input);
783 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000784}
785
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP, read from the interactive prompt, from the
   encoding of sys.stdin into UTF-8.

   Nothing is done (and 0 is returned) unless sys.stdin is the C stdin
   and carries a string f_encoding.  On a successful conversion *INP is
   freed and replaced by the UTF-8 copy and tok->encoding is set to a copy
   of the stdin encoding name.  A UnicodeDecodeError is cleared and the
   line is left untouched.

   Returns 0 on success (including "left untouched"), -1 on error with
   tok->done set to E_NOMEM or E_ERROR. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *stdin_obj, *enc_obj, *unicode, *encoded;
    const char *enc_name;
    char *utf8_copy;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold a reference while the name is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode == NULL)
        goto decode_failed;

    encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (encoded == NULL)
        goto decode_failed;

    assert(PyString_Check(encoded));
    utf8_copy = new_string(PyString_AS_STRING(encoded),
                           PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (utf8_copy == NULL)
        goto out_of_memory;

    PyMem_FREE(*inp);
    *inp = utf8_copy;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

decode_failed:
    Py_DECREF(enc_obj);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849
850/* Get next char, updating state; error code goes into tok->done */
851
852static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000853tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000855 for (;;) {
856 if (tok->cur != tok->inp) {
857 return Py_CHARMASK(*tok->cur++); /* Fast path */
858 }
859 if (tok->done != E_OK)
860 return EOF;
861 if (tok->fp == NULL) {
862 char *end = strchr(tok->inp, '\n');
863 if (end != NULL)
864 end++;
865 else {
866 end = strchr(tok->inp, '\0');
867 if (end == tok->inp) {
868 tok->done = E_EOF;
869 return EOF;
870 }
871 }
872 if (tok->start == NULL)
873 tok->buf = tok->cur;
874 tok->line_start = tok->cur;
875 tok->lineno++;
876 tok->inp = end;
877 return Py_CHARMASK(*tok->cur++);
878 }
879 if (tok->prompt != NULL) {
880 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
881 if (tok->nextprompt != NULL)
882 tok->prompt = tok->nextprompt;
883 if (newtok == NULL)
884 tok->done = E_INTR;
885 else if (*newtok == '\0') {
886 PyMem_FREE(newtok);
887 tok->done = E_EOF;
888 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000889#if !defined(PGEN) && defined(Py_USING_UNICODE)
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000890 else if (tok_stdin_decode(tok, &newtok) != 0)
891 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000892#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000893 else if (tok->start != NULL) {
894 size_t start = tok->start - tok->buf;
895 size_t oldlen = tok->cur - tok->buf;
896 size_t newlen = oldlen + strlen(newtok);
897 char *buf = tok->buf;
898 buf = (char *)PyMem_REALLOC(buf, newlen+1);
899 tok->lineno++;
900 if (buf == NULL) {
901 PyMem_FREE(tok->buf);
902 tok->buf = NULL;
903 PyMem_FREE(newtok);
904 tok->done = E_NOMEM;
905 return EOF;
906 }
907 tok->buf = buf;
908 tok->cur = tok->buf + oldlen;
909 tok->line_start = tok->cur;
910 strcpy(tok->buf + oldlen, newtok);
911 PyMem_FREE(newtok);
912 tok->inp = tok->buf + newlen;
913 tok->end = tok->inp + 1;
914 tok->start = tok->buf + start;
915 }
916 else {
917 tok->lineno++;
918 if (tok->buf != NULL)
919 PyMem_FREE(tok->buf);
920 tok->buf = newtok;
921 tok->line_start = tok->buf;
922 tok->cur = tok->buf;
923 tok->line_start = tok->buf;
924 tok->inp = strchr(tok->buf, '\0');
925 tok->end = tok->inp + 1;
926 }
927 }
928 else {
929 int done = 0;
930 Py_ssize_t cur = 0;
931 char *pt;
932 if (tok->start == NULL) {
933 if (tok->buf == NULL) {
934 tok->buf = (char *)
935 PyMem_MALLOC(BUFSIZ);
936 if (tok->buf == NULL) {
937 tok->done = E_NOMEM;
938 return EOF;
939 }
940 tok->end = tok->buf + BUFSIZ;
941 }
942 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
943 tok) == NULL) {
944 tok->done = E_EOF;
945 done = 1;
946 }
947 else {
948 tok->done = E_OK;
949 tok->inp = strchr(tok->buf, '\0');
950 done = tok->inp[-1] == '\n';
951 }
952 }
953 else {
954 cur = tok->cur - tok->buf;
955 if (decoding_feof(tok)) {
956 tok->done = E_EOF;
957 done = 1;
958 }
959 else
960 tok->done = E_OK;
961 }
962 tok->lineno++;
963 /* Read until '\n' or EOF */
964 while (!done) {
965 Py_ssize_t curstart = tok->start == NULL ? -1 :
966 tok->start - tok->buf;
967 Py_ssize_t curvalid = tok->inp - tok->buf;
968 Py_ssize_t newsize = curvalid + BUFSIZ;
969 char *newbuf = tok->buf;
970 newbuf = (char *)PyMem_REALLOC(newbuf,
971 newsize);
972 if (newbuf == NULL) {
973 tok->done = E_NOMEM;
974 tok->cur = tok->inp;
975 return EOF;
976 }
977 tok->buf = newbuf;
978 tok->inp = tok->buf + curvalid;
979 tok->end = tok->buf + newsize;
980 tok->start = curstart < 0 ? NULL :
981 tok->buf + curstart;
982 if (decoding_fgets(tok->inp,
983 (int)(tok->end - tok->inp),
984 tok) == NULL) {
985 /* Break out early on decoding
986 errors, as tok->buf will be NULL
987 */
988 if (tok->decoding_erred)
989 return EOF;
990 /* Last line does not end in \n,
991 fake one */
992 strcpy(tok->inp, "\n");
993 }
994 tok->inp = strchr(tok->inp, '\0');
995 done = tok->inp[-1] == '\n';
996 }
997 if (tok->buf != NULL) {
998 tok->cur = tok->buf + cur;
999 tok->line_start = tok->cur;
1000 /* replace "\r\n" with "\n" */
1001 /* For Mac leave the \r, giving a syntax error */
1002 pt = tok->inp - 2;
1003 if (pt >= tok->buf && *pt == '\r') {
1004 *pt++ = '\n';
1005 *pt = '\0';
1006 tok->inp = pt;
1007 }
1008 }
1009 }
1010 if (tok->done != E_OK) {
1011 if (tok->prompt != NULL)
1012 PySys_WriteStderr("\n");
1013 tok->cur = tok->inp;
1014 return EOF;
1015 }
1016 }
1017 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001018}
1019
1020
1021/* Back-up one character */
1022
1023static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001024tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001026 if (c != EOF) {
1027 if (--tok->cur < tok->buf)
1028 Py_FatalError("tok_backup: beginning of buffer");
1029 if (*tok->cur != c)
1030 *tok->cur = c;
1031 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032}
1033
1034
1035/* Return the token corresponding to a single character */
1036
1037int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001038PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001040 switch (c) {
1041 case '(': return LPAR;
1042 case ')': return RPAR;
1043 case '[': return LSQB;
1044 case ']': return RSQB;
1045 case ':': return COLON;
1046 case ',': return COMMA;
1047 case ';': return SEMI;
1048 case '+': return PLUS;
1049 case '-': return MINUS;
1050 case '*': return STAR;
1051 case '/': return SLASH;
1052 case '|': return VBAR;
1053 case '&': return AMPER;
1054 case '<': return LESS;
1055 case '>': return GREATER;
1056 case '=': return EQUAL;
1057 case '.': return DOT;
1058 case '%': return PERCENT;
1059 case '`': return BACKQUOTE;
1060 case '{': return LBRACE;
1061 case '}': return RBRACE;
1062 case '^': return CIRCUMFLEX;
1063 case '~': return TILDE;
1064 case '@': return AT;
1065 default: return OP;
1066 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001067}
1068
1069
Guido van Rossumfbab9051991-10-20 20:25:03 +00001070int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001071PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001072{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001073 switch (c1) {
1074 case '=':
1075 switch (c2) {
1076 case '=': return EQEQUAL;
1077 }
1078 break;
1079 case '!':
1080 switch (c2) {
1081 case '=': return NOTEQUAL;
1082 }
1083 break;
1084 case '<':
1085 switch (c2) {
1086 case '>': return NOTEQUAL;
1087 case '=': return LESSEQUAL;
1088 case '<': return LEFTSHIFT;
1089 }
1090 break;
1091 case '>':
1092 switch (c2) {
1093 case '=': return GREATEREQUAL;
1094 case '>': return RIGHTSHIFT;
1095 }
1096 break;
1097 case '+':
1098 switch (c2) {
1099 case '=': return PLUSEQUAL;
1100 }
1101 break;
1102 case '-':
1103 switch (c2) {
1104 case '=': return MINEQUAL;
1105 }
1106 break;
1107 case '*':
1108 switch (c2) {
1109 case '*': return DOUBLESTAR;
1110 case '=': return STAREQUAL;
1111 }
1112 break;
1113 case '/':
1114 switch (c2) {
1115 case '/': return DOUBLESLASH;
1116 case '=': return SLASHEQUAL;
1117 }
1118 break;
1119 case '|':
1120 switch (c2) {
1121 case '=': return VBAREQUAL;
1122 }
1123 break;
1124 case '%':
1125 switch (c2) {
1126 case '=': return PERCENTEQUAL;
1127 }
1128 break;
1129 case '&':
1130 switch (c2) {
1131 case '=': return AMPEREQUAL;
1132 }
1133 break;
1134 case '^':
1135 switch (c2) {
1136 case '=': return CIRCUMFLEXEQUAL;
1137 }
1138 break;
1139 }
1140 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001141}
1142
Thomas Wouters434d0822000-08-24 20:11:32 +00001143int
1144PyToken_ThreeChars(int c1, int c2, int c3)
1145{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001146 switch (c1) {
1147 case '<':
1148 switch (c2) {
1149 case '<':
1150 switch (c3) {
1151 case '=':
1152 return LEFTSHIFTEQUAL;
1153 }
1154 break;
1155 }
1156 break;
1157 case '>':
1158 switch (c2) {
1159 case '>':
1160 switch (c3) {
1161 case '=':
1162 return RIGHTSHIFTEQUAL;
1163 }
1164 break;
1165 }
1166 break;
1167 case '*':
1168 switch (c2) {
1169 case '*':
1170 switch (c3) {
1171 case '=':
1172 return DOUBLESTAREQUAL;
1173 }
1174 break;
1175 }
1176 break;
1177 case '/':
1178 switch (c2) {
1179 case '/':
1180 switch (c3) {
1181 case '=':
1182 return DOUBLESLASHEQUAL;
1183 }
1184 break;
1185 }
1186 break;
1187 }
1188 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001189}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001190
Guido van Rossum926f13a1998-04-09 21:38:06 +00001191static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001192indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001193{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001194 if (tok->alterror) {
1195 tok->done = E_TABSPACE;
1196 tok->cur = tok->inp;
1197 return 1;
1198 }
1199 if (tok->altwarning) {
1200 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1201 "in indentation\n", tok->filename);
1202 tok->altwarning = 0;
1203 }
1204 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001205}
1206
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207/* Get next token, after space stripping etc. */
1208
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001209static int
1210tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001212 register int c;
1213 int blankline;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001214
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001215 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001216 nextline:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001217 tok->start = NULL;
1218 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001219
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001220 /* Get indentation level */
1221 if (tok->atbol) {
1222 register int col = 0;
1223 register int altcol = 0;
1224 tok->atbol = 0;
1225 for (;;) {
1226 c = tok_nextc(tok);
1227 if (c == ' ')
1228 col++, altcol++;
1229 else if (c == '\t') {
1230 col = (col/tok->tabsize + 1) * tok->tabsize;
1231 altcol = (altcol/tok->alttabsize + 1)
1232 * tok->alttabsize;
1233 }
1234 else if (c == '\014') /* Control-L (formfeed) */
1235 col = altcol = 0; /* For Emacs users */
1236 else
1237 break;
1238 }
1239 tok_backup(tok, c);
1240 if (c == '#' || c == '\n') {
1241 /* Lines with only whitespace and/or comments
1242 shouldn't affect the indentation and are
1243 not passed to the parser as NEWLINE tokens,
1244 except *totally* empty lines in interactive
1245 mode, which signal the end of a command group. */
1246 if (col == 0 && c == '\n' && tok->prompt != NULL)
1247 blankline = 0; /* Let it through */
1248 else
1249 blankline = 1; /* Ignore completely */
1250 /* We can't jump back right here since we still
1251 may need to skip to the end of a comment */
1252 }
1253 if (!blankline && tok->level == 0) {
1254 if (col == tok->indstack[tok->indent]) {
1255 /* No change */
1256 if (altcol != tok->altindstack[tok->indent]) {
1257 if (indenterror(tok))
1258 return ERRORTOKEN;
1259 }
1260 }
1261 else if (col > tok->indstack[tok->indent]) {
1262 /* Indent -- always one */
1263 if (tok->indent+1 >= MAXINDENT) {
1264 tok->done = E_TOODEEP;
1265 tok->cur = tok->inp;
1266 return ERRORTOKEN;
1267 }
1268 if (altcol <= tok->altindstack[tok->indent]) {
1269 if (indenterror(tok))
1270 return ERRORTOKEN;
1271 }
1272 tok->pendin++;
1273 tok->indstack[++tok->indent] = col;
1274 tok->altindstack[tok->indent] = altcol;
1275 }
1276 else /* col < tok->indstack[tok->indent] */ {
1277 /* Dedent -- any number, must be consistent */
1278 while (tok->indent > 0 &&
1279 col < tok->indstack[tok->indent]) {
1280 tok->pendin--;
1281 tok->indent--;
1282 }
1283 if (col != tok->indstack[tok->indent]) {
1284 tok->done = E_DEDENT;
1285 tok->cur = tok->inp;
1286 return ERRORTOKEN;
1287 }
1288 if (altcol != tok->altindstack[tok->indent]) {
1289 if (indenterror(tok))
1290 return ERRORTOKEN;
1291 }
1292 }
1293 }
1294 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001295
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001296 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001297
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001298 /* Return pending indents/dedents */
1299 if (tok->pendin != 0) {
1300 if (tok->pendin < 0) {
1301 tok->pendin++;
1302 return DEDENT;
1303 }
1304 else {
1305 tok->pendin--;
1306 return INDENT;
1307 }
1308 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001309
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 again:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001311 tok->start = NULL;
1312 /* Skip spaces */
1313 do {
1314 c = tok_nextc(tok);
1315 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001316
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001317 /* Set start of current token */
1318 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001319
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001320 /* Skip comment, while looking for tab-setting magic */
1321 if (c == '#') {
1322 static char *tabforms[] = {
1323 "tab-width:", /* Emacs */
1324 ":tabstop=", /* vim, full form */
1325 ":ts=", /* vim, abbreviated form */
1326 "set tabsize=", /* will vi never die? */
1327 /* more templates can be added here to support other editors */
1328 };
1329 char cbuf[80];
1330 char *tp, **cp;
1331 tp = cbuf;
1332 do {
1333 *tp++ = c = tok_nextc(tok);
1334 } while (c != EOF && c != '\n' &&
1335 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1336 *tp = '\0';
1337 for (cp = tabforms;
1338 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1339 cp++) {
1340 if ((tp = strstr(cbuf, *cp))) {
1341 int newsize = atoi(tp + strlen(*cp));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001342
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001343 if (newsize >= 1 && newsize <= 40) {
1344 tok->tabsize = newsize;
1345 if (Py_VerboseFlag)
1346 PySys_WriteStderr(
1347 "Tab size set to %d\n",
1348 newsize);
1349 }
1350 }
1351 }
1352 while (c != EOF && c != '\n')
1353 c = tok_nextc(tok);
1354 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001355
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001356 /* Check for EOF and errors now */
1357 if (c == EOF) {
1358 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1359 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001360
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001361 /* Identifier (most frequent token!) */
1362 if (Py_ISALPHA(c) || c == '_') {
1363 /* Process r"", u"" and ur"" */
1364 switch (c) {
1365 case 'b':
1366 case 'B':
1367 c = tok_nextc(tok);
1368 if (c == 'r' || c == 'R')
1369 c = tok_nextc(tok);
1370 if (c == '"' || c == '\'')
1371 goto letter_quote;
1372 break;
1373 case 'r':
1374 case 'R':
1375 c = tok_nextc(tok);
1376 if (c == '"' || c == '\'')
1377 goto letter_quote;
1378 break;
1379 case 'u':
1380 case 'U':
1381 c = tok_nextc(tok);
1382 if (c == 'r' || c == 'R')
1383 c = tok_nextc(tok);
1384 if (c == '"' || c == '\'')
1385 goto letter_quote;
1386 break;
1387 }
Stefan Krah3db41612010-06-24 09:33:05 +00001388 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001389 c = tok_nextc(tok);
1390 }
1391 tok_backup(tok, c);
1392 *p_start = tok->start;
1393 *p_end = tok->cur;
1394 return NAME;
1395 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001396
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001397 /* Newline */
1398 if (c == '\n') {
1399 tok->atbol = 1;
1400 if (blankline || tok->level > 0)
1401 goto nextline;
1402 *p_start = tok->start;
1403 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1404 tok->cont_line = 0;
1405 return NEWLINE;
1406 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001407
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001408 /* Period or number starting with period? */
1409 if (c == '.') {
1410 c = tok_nextc(tok);
1411 if (isdigit(c)) {
1412 goto fraction;
1413 }
1414 else {
1415 tok_backup(tok, c);
1416 *p_start = tok->start;
1417 *p_end = tok->cur;
1418 return DOT;
1419 }
1420 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001421
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001422 /* Number */
1423 if (isdigit(c)) {
1424 if (c == '0') {
1425 /* Hex, octal or binary -- maybe. */
1426 c = tok_nextc(tok);
1427 if (c == '.')
1428 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001429#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001430 if (c == 'j' || c == 'J')
1431 goto imaginary;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001432#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001433 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001434
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001435 /* Hex */
1436 c = tok_nextc(tok);
1437 if (!isxdigit(c)) {
1438 tok->done = E_TOKEN;
1439 tok_backup(tok, c);
1440 return ERRORTOKEN;
1441 }
1442 do {
1443 c = tok_nextc(tok);
1444 } while (isxdigit(c));
1445 }
1446 else if (c == 'o' || c == 'O') {
1447 /* Octal */
1448 c = tok_nextc(tok);
1449 if (c < '0' || c >= '8') {
1450 tok->done = E_TOKEN;
1451 tok_backup(tok, c);
1452 return ERRORTOKEN;
1453 }
1454 do {
1455 c = tok_nextc(tok);
1456 } while ('0' <= c && c < '8');
1457 }
1458 else if (c == 'b' || c == 'B') {
1459 /* Binary */
1460 c = tok_nextc(tok);
1461 if (c != '0' && c != '1') {
1462 tok->done = E_TOKEN;
1463 tok_backup(tok, c);
1464 return ERRORTOKEN;
1465 }
1466 do {
1467 c = tok_nextc(tok);
1468 } while (c == '0' || c == '1');
1469 }
1470 else {
1471 int found_decimal = 0;
1472 /* Octal; c is first char of it */
1473 /* There's no 'isoctdigit' macro, sigh */
1474 while ('0' <= c && c < '8') {
1475 c = tok_nextc(tok);
1476 }
1477 if (isdigit(c)) {
1478 found_decimal = 1;
1479 do {
1480 c = tok_nextc(tok);
1481 } while (isdigit(c));
1482 }
1483 if (c == '.')
1484 goto fraction;
1485 else if (c == 'e' || c == 'E')
1486 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001487#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001488 else if (c == 'j' || c == 'J')
1489 goto imaginary;
Tim Petersd507dab2001-08-30 20:51:59 +00001490#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001491 else if (found_decimal) {
1492 tok->done = E_TOKEN;
1493 tok_backup(tok, c);
1494 return ERRORTOKEN;
1495 }
1496 }
1497 if (c == 'l' || c == 'L')
1498 c = tok_nextc(tok);
1499 }
1500 else {
1501 /* Decimal */
1502 do {
1503 c = tok_nextc(tok);
1504 } while (isdigit(c));
1505 if (c == 'l' || c == 'L')
1506 c = tok_nextc(tok);
1507 else {
1508 /* Accept floating point numbers. */
1509 if (c == '.') {
1510 fraction:
1511 /* Fraction */
1512 do {
1513 c = tok_nextc(tok);
1514 } while (isdigit(c));
1515 }
1516 if (c == 'e' || c == 'E') {
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001517 int e;
1518 exponent:
1519 e = c;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001520 /* Exponent part */
1521 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001522 if (c == '+' || c == '-') {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001523 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001524 if (!isdigit(c)) {
1525 tok->done = E_TOKEN;
1526 tok_backup(tok, c);
1527 return ERRORTOKEN;
1528 }
1529 } else if (!isdigit(c)) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001530 tok_backup(tok, c);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001531 tok_backup(tok, e);
1532 *p_start = tok->start;
1533 *p_end = tok->cur;
1534 return NUMBER;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001535 }
1536 do {
1537 c = tok_nextc(tok);
1538 } while (isdigit(c));
1539 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001540#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001541 if (c == 'j' || c == 'J')
1542 /* Imaginary part */
1543 imaginary:
1544 c = tok_nextc(tok);
Guido van Rossumf595fde1996-01-12 01:31:58 +00001545#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001546 }
1547 }
1548 tok_backup(tok, c);
1549 *p_start = tok->start;
1550 *p_end = tok->cur;
1551 return NUMBER;
1552 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001553
1554 letter_quote:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001555 /* String */
1556 if (c == '\'' || c == '"') {
1557 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1558 int quote = c;
1559 int triple = 0;
1560 int tripcount = 0;
1561 for (;;) {
1562 c = tok_nextc(tok);
1563 if (c == '\n') {
1564 if (!triple) {
1565 tok->done = E_EOLS;
1566 tok_backup(tok, c);
1567 return ERRORTOKEN;
1568 }
1569 tripcount = 0;
1570 tok->cont_line = 1; /* multiline string. */
1571 }
1572 else if (c == EOF) {
1573 if (triple)
1574 tok->done = E_EOFS;
1575 else
1576 tok->done = E_EOLS;
1577 tok->cur = tok->inp;
1578 return ERRORTOKEN;
1579 }
1580 else if (c == quote) {
1581 tripcount++;
1582 if (tok->cur - tok->start == quote2) {
1583 c = tok_nextc(tok);
1584 if (c == quote) {
1585 triple = 1;
1586 tripcount = 0;
1587 continue;
1588 }
1589 tok_backup(tok, c);
1590 }
1591 if (!triple || tripcount == 3)
1592 break;
1593 }
1594 else if (c == '\\') {
1595 tripcount = 0;
1596 c = tok_nextc(tok);
1597 if (c == EOF) {
1598 tok->done = E_EOLS;
1599 tok->cur = tok->inp;
1600 return ERRORTOKEN;
1601 }
1602 }
1603 else
1604 tripcount = 0;
1605 }
1606 *p_start = tok->start;
1607 *p_end = tok->cur;
1608 return STRING;
1609 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001610
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001611 /* Line continuation */
1612 if (c == '\\') {
1613 c = tok_nextc(tok);
1614 if (c != '\n') {
1615 tok->done = E_LINECONT;
1616 tok->cur = tok->inp;
1617 return ERRORTOKEN;
1618 }
1619 tok->cont_line = 1;
1620 goto again; /* Read next line */
1621 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001622
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001623 /* Check for two-character token */
1624 {
1625 int c2 = tok_nextc(tok);
1626 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001627#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001628 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1629 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1630 "<> not supported in 3.x; use !=",
1631 tok->filename, tok->lineno,
1632 NULL, NULL)) {
1633 return ERRORTOKEN;
1634 }
1635 }
Christian Heimes02c9ab52007-11-23 12:12:02 +00001636#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001637 if (token != OP) {
1638 int c3 = tok_nextc(tok);
1639 int token3 = PyToken_ThreeChars(c, c2, c3);
1640 if (token3 != OP) {
1641 token = token3;
1642 } else {
1643 tok_backup(tok, c3);
1644 }
1645 *p_start = tok->start;
1646 *p_end = tok->cur;
1647 return token;
1648 }
1649 tok_backup(tok, c2);
1650 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001651
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001652 /* Keep track of parentheses nesting level */
1653 switch (c) {
1654 case '(':
1655 case '[':
1656 case '{':
1657 tok->level++;
1658 break;
1659 case ')':
1660 case ']':
1661 case '}':
1662 tok->level--;
1663 break;
1664 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001665
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001666 /* Punctuation character */
1667 *p_start = tok->start;
1668 *p_end = tok->cur;
1669 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001670}
1671
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001672int
1673PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1674{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001675 int result = tok_get(tok, p_start, p_end);
1676 if (tok->decoding_erred) {
1677 result = ERRORTOKEN;
1678 tok->done = E_DECODE;
1679 }
1680 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001681}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001682
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when no Unicode support is compiled in: nothing to restore. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
/* Decode the first `len` bytes of `text` as UTF-8 and re-encode the result
   in encoding `enc`, using the "replace" error handler for both steps.
   Returns a new reference, or NULL on failure with the error cleared. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}

/* Convert the first `len` bytes of the tokenizer buffer back to the
   encoding recorded in tok->encoding.

   Returns a NUL-terminated copy allocated with PyObject_MALLOC, or NULL
   when no encoding is recorded, the conversion fails or memory runs out.
   When the conversion succeeds and *offset > 1, *offset (a 1-based
   position in the buffer) is rewritten to the matching position in the
   re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    if (tok->encoding) {
        /* convert source to original encoding */
        PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
        if (lineobj != NULL) {
            Py_ssize_t linelen = PyString_Size(lineobj);
            const char *line = PyString_AsString(lineobj);
            if (line != NULL && linelen >= 0) {
                text = PyObject_MALLOC(linelen + 1);
                if (text != NULL) {
                    memcpy(text, line, linelen);
                    text[linelen] = '\0';
                }
            }
            else {
                /* Never hand back an uninitialised buffer; stay
                   best-effort like dec_utf8() and drop the error. */
                PyErr_Clear();
            }
            Py_DECREF(lineobj);

            /* adjust error offset */
            if (*offset > 1) {
                PyObject *offsetobj = dec_utf8(tok->encoding,
                                               tok->buf, *offset-1);
                if (offsetobj) {
                    *offset = (int)(PyString_Size(offsetobj) + 1);
                    Py_DECREF(offsetobj);
                }
            }

        }
    }
    return text;

}
#endif /* defined(Py_USING_UNICODE) */
#endif
1743
Martin v. Löwisa5136192007-09-04 14:19:28 +00001744
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text (the bytes from `start` up to `end`) in parentheses for
   NAME, NUMBER, STRING and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif