blob: 109c0eee928326c9f7e6aa3e3b54d203d6cf0ecd [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file. */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names.

   Printable name for every token number, indexed by that number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",            /* 0 */
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",                 /* 10 */
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",                 /* 20 */
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",            /* 30 */
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",           /* 40 */
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",                   /* 50 */
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000098tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000130 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131}
132
Benjamin Petersone36199b2009-11-12 23:39:44 +0000133static char *
134new_string(const char *s, Py_ssize_t len)
135{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000142}
143
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144#ifdef PGEN
145
146static char *
147decoding_fgets(char *s, int size, struct tok_state *tok)
148{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000150}
151
152static int
153decoding_feof(struct tok_state *tok)
154{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000155 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000156}
157
/* PGEN build: no decoding is done; return a freshly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */

static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}
163
164#else /* PGEN */
165
166static char *
167error_ret(struct tok_state *tok) /* XXX */
168{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171 PyMem_FREE(tok->buf);
172 tok->buf = NULL;
173 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is (or starts with, followed by '-') one of the
   recognised spellings; otherwise returns S itself, unmodified. */

static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte
           >= 0x80 where plain char is signed) to tolower() is undefined
           behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000211 Py_ssize_t i;
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000230
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000231 begin = t;
232 while (Py_ISALNUM(t[0]) ||
233 t[0] == '-' || t[0] == '_' || t[0] == '.')
234 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
Benjamin Peterson223546d2015-08-13 21:52:56 -0700238 char* q;
239 if (!r)
240 return NULL;
241 q = get_normal_name(r);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000242 if (r != q) {
243 PyMem_FREE(r);
244 r = new_string(q, strlen(q));
245 }
246 return r;
247 }
248 }
249 }
250 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251}
252
253/* Check whether the line contains a coding spec. If it does,
254 invoke the set_readline function for the new encoding.
255 This function receives the tok_state and the new encoding.
256 Return 1 on success, 0 on failure. */
257
258static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000259check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000260 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000262 char * cs;
263 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300265 if (tok->cont_line) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000266 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300267 tok->read_coding_spec = 1;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000268 return 1;
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300269 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000270 cs = get_coding_spec(line, size);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300271 if (!cs) {
272 Py_ssize_t i;
273 for (i = 0; i < size; i++) {
274 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
275 break;
276 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
277 /* Stop checking coding spec after a line containing
278 * anything except a comment. */
279 tok->read_coding_spec = 1;
280 break;
281 }
282 }
283 } else {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000284 tok->read_coding_spec = 1;
285 if (tok->encoding == NULL) {
286 assert(tok->decoding_state == 1); /* raw */
287 if (strcmp(cs, "utf-8") == 0 ||
288 strcmp(cs, "iso-8859-1") == 0) {
289 tok->encoding = cs;
290 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000291#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000292 r = set_readline(tok, cs);
293 if (r) {
294 tok->encoding = cs;
295 tok->decoding_state = -1;
296 }
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300297 else {
298 PyErr_Format(PyExc_SyntaxError,
299 "encoding problem: %s", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000300 PyMem_FREE(cs);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300301 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000302#else
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000303 /* Without Unicode support, we cannot
304 process the coding spec. Since there
305 won't be any Unicode literals, that
306 won't matter. */
307 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000308#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000309 }
310 } else { /* then, compare cs with BOM */
311 r = (strcmp(tok->encoding, cs) == 0);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300312 if (!r)
313 PyErr_Format(PyExc_SyntaxError,
314 "encoding problem: %s with BOM", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000315 PyMem_FREE(cs);
316 }
317 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000318 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319}
320
321/* See whether the file starts with a BOM. If it does,
322 invoke the set_readline function with the new encoding.
323 Return 1 on success, 0 on failure. */
324
325static int
326check_bom(int get_char(struct tok_state *),
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000327 void unget_char(int, struct tok_state *),
328 int set_readline(struct tok_state *, const char *),
329 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000330{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000331 int ch1, ch2, ch3;
332 ch1 = get_char(tok);
333 tok->decoding_state = 1;
334 if (ch1 == EOF) {
335 return 1;
336 } else if (ch1 == 0xEF) {
337 ch2 = get_char(tok);
338 if (ch2 != 0xBB) {
339 unget_char(ch2, tok);
340 unget_char(ch1, tok);
341 return 1;
342 }
343 ch3 = get_char(tok);
344 if (ch3 != 0xBF) {
345 unget_char(ch3, tok);
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000350#if 0
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000351 /* Disable support for UTF-16 BOMs until a decision
352 is made whether this needs to be supported. */
353 } else if (ch1 == 0xFE) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFF) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-be"))
361 return 0;
362 tok->decoding_state = -1;
363 } else if (ch1 == 0xFF) {
364 ch2 = get_char(tok);
365 if (ch2 != 0xFE) {
366 unget_char(ch2, tok);
367 unget_char(ch1, tok);
368 return 1;
369 }
370 if (!set_readline(tok, "utf-16-le"))
371 return 0;
372 tok->decoding_state = -1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000374 } else {
375 unget_char(ch1, tok);
376 return 1;
377 }
378 if (tok->encoding != NULL)
379 PyMem_FREE(tok->encoding);
380 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
381 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000382}
383
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored, followed by a NUL.  NULL is also
   returned, without an error being recorded, when the line read is
   empty (end of file).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        /* Take over the reference held by tok->decoding_buffer. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;         /* case 3: already UTF-8 bytes */
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Does not fit: stash the tail for the next call. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
455
456/* Set the readline function for TOK to a StreamReader's
457 readline function. The StreamReader is named ENC.
458
459 This function is called from check_bom and check_coding_spec.
460
461 ENC is usually identical to the future value of tok->encoding,
462 except for the (currently unsupported) case of UTF-16.
463
464 Return 1 on success, 0 on failure. */
465
466static int
467fp_setreadl(struct tok_state *tok, const char* enc)
468{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000469 PyObject *reader, *stream, *readline;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000471 /* XXX: constify filename argument. */
472 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
473 if (stream == NULL)
474 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000476 reader = PyCodec_StreamReader(enc, stream, NULL);
477 Py_DECREF(stream);
478 if (reader == NULL)
479 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000481 readline = PyObject_GetAttrString(reader, "readline");
482 Py_DECREF(reader);
483 if (readline == NULL)
484 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000486 tok->decoding_readline = readline;
487 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488}
489
490/* Fetch the next byte from TOK. */
491
492static int fp_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000493 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494}
495
496/* Unfetch the last byte back into TOK. */
497
498static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000499 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500}
501
502/* Read a line of input from TOK. Determine encoding
503 if necessary. */
504
505static char *
506decoding_fgets(char *s, int size, struct tok_state *tok)
507{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000508 char *line = NULL;
509 int badchar = 0;
510 for (;;) {
511 if (tok->decoding_state < 0) {
512 /* We already have a codec associated with
513 this input. */
514 line = fp_readl(s, size, tok);
515 break;
516 } else if (tok->decoding_state > 0) {
517 /* We want a 'raw' read. */
518 line = Py_UniversalNewlineFgets(s, size,
519 tok->fp, NULL);
520 break;
521 } else {
522 /* We have not yet determined the encoding.
523 If an encoding is found, use the file-pointer
524 reader functions from now on. */
525 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
526 return error_ret(tok);
527 assert(tok->decoding_state != 0);
528 }
529 }
530 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
531 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
532 return error_ret(tok);
533 }
534 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000536 /* The default encoding is ASCII, so make sure we don't have any
537 non-ASCII bytes in it. */
538 if (line && !tok->encoding) {
539 unsigned char *c;
540 for (c = (unsigned char *)line; *c; c++)
541 if (*c > 127) {
542 badchar = *c;
543 break;
544 }
545 }
546 if (badchar) {
547 char buf[500];
548 /* Need to add 1 to the line number, since this line
549 has not been counted, yet. */
550 sprintf(buf,
551 "Non-ASCII character '\\x%.2x' "
552 "in file %.200s on line %i, "
553 "but no encoding declared; "
Ned Deily24b82092014-06-17 12:24:53 -0700554 "see http://python.org/dev/peps/pep-0263/ for details",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000555 badchar, tok->filename, tok->lineno + 1);
556 PyErr_SetString(PyExc_SyntaxError, buf);
557 return error_ret(tok);
558 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000560 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000561}
562
563static int
564decoding_feof(struct tok_state *tok)
565{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000566 if (tok->decoding_state >= 0) {
567 return feof(tok->fp);
568 } else {
569 PyObject* buf = tok->decoding_buffer;
570 if (buf == NULL) {
571 buf = PyObject_CallObject(tok->decoding_readline, NULL);
572 if (buf == NULL) {
573 error_ret(tok);
574 return 1;
575 } else {
576 tok->decoding_buffer = buf;
577 }
578 }
579 return PyObject_Length(buf) == 0;
580 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000581}
582
583/* Fetch a byte from TOK, using the string buffer. */
584
Tim Petersc9d78aa2006-03-26 23:27:58 +0000585static int
586buf_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000587 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588}
589
590/* Unfetch a byte from TOK, using the string buffer. */
591
Tim Petersc9d78aa2006-03-26 23:27:58 +0000592static void
593buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000594 tok->str--;
595 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596}
597
598/* Set the readline function for TOK to ENC. For the string-based
599 tokenizer, this means to just record the encoding. */
600
Tim Petersc9d78aa2006-03-26 23:27:58 +0000601static int
602buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000603 tok->enc = enc;
604 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605}
606
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622
Benjamin Petersone36199b2009-11-12 23:39:44 +0000623
624static char *
625translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000626 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
627 char *buf, *current;
628 char c = '\0';
629 buf = PyMem_MALLOC(needed_length);
630 if (buf == NULL) {
631 tok->done = E_NOMEM;
632 return NULL;
633 }
634 for (current = buf; *s; s++, current++) {
635 c = *s;
636 if (skip_next_lf) {
637 skip_next_lf = 0;
638 if (c == '\n') {
639 c = *++s;
640 if (!c)
641 break;
642 }
643 }
644 if (c == '\r') {
645 skip_next_lf = 1;
646 c = '\n';
647 }
648 *current = c;
649 }
650 /* If this is exec input, add a newline to the end of the string if
651 there isn't one already. */
652 if (exec_input && c != '\n') {
653 *current = '\n';
654 current++;
655 }
656 *current = '\0';
657 final_length = current - buf + 1;
658 if (final_length < needed_length && final_length)
659 /* should never fail */
660 buf = PyMem_REALLOC(buf, final_length);
661 return buf;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000662}
663
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664/* Decode a byte string STR for use as the buffer of TOK.
665 Look for encoding declarations inside STR, and record them
666 inside TOK. */
667
668static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000669decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000671 PyObject* utf8 = NULL;
672 const char *str;
673 const char *s;
674 const char *newl[2] = {NULL, NULL};
675 int lineno = 0;
676 tok->input = str = translate_newlines(input, single, tok);
677 if (str == NULL)
678 return NULL;
679 tok->enc = NULL;
680 tok->str = str;
681 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
682 return error_ret(tok);
683 str = tok->str; /* string after BOM if any */
684 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000685#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000686 if (tok->enc != NULL) {
687 utf8 = translate_into_utf8(str, tok->enc);
688 if (utf8 == NULL)
689 return error_ret(tok);
690 str = PyString_AsString(utf8);
691 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000692#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000693 for (s = str;; s++) {
694 if (*s == '\0') break;
695 else if (*s == '\n') {
696 assert(lineno < 2);
697 newl[lineno] = s;
698 lineno++;
699 if (lineno == 2) break;
700 }
701 }
702 tok->enc = NULL;
703 /* need to check line 1 and 2 separately since check_coding_spec
704 assumes a single line as input */
705 if (newl[0]) {
706 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
707 return error_ret(tok);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300708 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000709 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
710 tok, buf_setreadl))
711 return error_ret(tok);
712 }
713 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000714#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000715 if (tok->enc != NULL) {
716 assert(utf8 == NULL);
717 utf8 = translate_into_utf8(str, tok->enc);
718 if (utf8 == NULL)
719 return error_ret(tok);
720 str = PyString_AsString(utf8);
721 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000722#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000723 assert(tok->decoding_buffer == NULL);
724 tok->decoding_buffer = utf8; /* CAUTION */
725 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000726}
727
728#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729
730/* Set up tokenizer for string */
731
732struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000733PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000734{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000735 struct tok_state *tok = tok_new();
736 if (tok == NULL)
737 return NULL;
738 str = (char *)decode_str(str, exec_input, tok);
739 if (str == NULL) {
740 PyTokenizer_Free(tok);
741 return NULL;
742 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000743
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000744 /* XXX: constify members. */
745 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
746 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747}
748
749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000753PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000755 struct tok_state *tok = tok_new();
756 if (tok == NULL)
757 return NULL;
758 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
759 PyTokenizer_Free(tok);
760 return NULL;
761 }
762 tok->cur = tok->inp = tok->buf;
763 tok->end = tok->buf + BUFSIZ;
764 tok->fp = fp;
765 tok->prompt = ps1;
766 tok->nextprompt = ps2;
767 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768}
769
770
771/* Free a tok_state structure */
772
773void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000774PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000776 if (tok->encoding != NULL)
777 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000778#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000779 Py_XDECREF(tok->decoding_readline);
780 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000781#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000782 if (tok->fp != NULL && tok->buf != NULL)
783 PyMem_FREE(tok->buf);
784 if (tok->input)
785 PyMem_FREE((char *)tok->input);
786 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787}
788
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode a line read from the interactive prompt as UTF-8.

   *INP is a PyMem-allocated line.  If sys.stdin is the C-level stdin, is
   a file object and carries a string f_encoding, the line is decoded
   with that encoding and *INP is replaced by a new PyMem-allocated UTF-8
   copy (the old one is freed); tok->encoding is set to the encoding
   name.

   Returns 0 on success and whenever the line is left untouched: no
   usable encoding, or a UnicodeDecodeError, which is cleared.  Returns
   -1 with tok->done set to E_NOMEM or E_ERROR otherwise. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    /* Hold our own reference while the encoding name is in use. */
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852
853/* Get next char, updating state; error code goes into tok->done */
854
855static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000856tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000858 for (;;) {
859 if (tok->cur != tok->inp) {
860 return Py_CHARMASK(*tok->cur++); /* Fast path */
861 }
862 if (tok->done != E_OK)
863 return EOF;
864 if (tok->fp == NULL) {
865 char *end = strchr(tok->inp, '\n');
866 if (end != NULL)
867 end++;
868 else {
869 end = strchr(tok->inp, '\0');
870 if (end == tok->inp) {
871 tok->done = E_EOF;
872 return EOF;
873 }
874 }
875 if (tok->start == NULL)
876 tok->buf = tok->cur;
877 tok->line_start = tok->cur;
878 tok->lineno++;
879 tok->inp = end;
880 return Py_CHARMASK(*tok->cur++);
881 }
882 if (tok->prompt != NULL) {
883 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
884 if (tok->nextprompt != NULL)
885 tok->prompt = tok->nextprompt;
886 if (newtok == NULL)
887 tok->done = E_INTR;
888 else if (*newtok == '\0') {
889 PyMem_FREE(newtok);
890 tok->done = E_EOF;
891 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000892#if !defined(PGEN) && defined(Py_USING_UNICODE)
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000893 else if (tok_stdin_decode(tok, &newtok) != 0)
894 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000895#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000896 else if (tok->start != NULL) {
897 size_t start = tok->start - tok->buf;
898 size_t oldlen = tok->cur - tok->buf;
899 size_t newlen = oldlen + strlen(newtok);
900 char *buf = tok->buf;
901 buf = (char *)PyMem_REALLOC(buf, newlen+1);
902 tok->lineno++;
903 if (buf == NULL) {
904 PyMem_FREE(tok->buf);
905 tok->buf = NULL;
906 PyMem_FREE(newtok);
907 tok->done = E_NOMEM;
908 return EOF;
909 }
910 tok->buf = buf;
911 tok->cur = tok->buf + oldlen;
912 tok->line_start = tok->cur;
913 strcpy(tok->buf + oldlen, newtok);
914 PyMem_FREE(newtok);
915 tok->inp = tok->buf + newlen;
916 tok->end = tok->inp + 1;
917 tok->start = tok->buf + start;
918 }
919 else {
920 tok->lineno++;
921 if (tok->buf != NULL)
922 PyMem_FREE(tok->buf);
923 tok->buf = newtok;
924 tok->line_start = tok->buf;
925 tok->cur = tok->buf;
926 tok->line_start = tok->buf;
927 tok->inp = strchr(tok->buf, '\0');
928 tok->end = tok->inp + 1;
929 }
930 }
931 else {
932 int done = 0;
933 Py_ssize_t cur = 0;
934 char *pt;
935 if (tok->start == NULL) {
936 if (tok->buf == NULL) {
937 tok->buf = (char *)
938 PyMem_MALLOC(BUFSIZ);
939 if (tok->buf == NULL) {
940 tok->done = E_NOMEM;
941 return EOF;
942 }
943 tok->end = tok->buf + BUFSIZ;
944 }
945 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
946 tok) == NULL) {
947 tok->done = E_EOF;
948 done = 1;
949 }
950 else {
951 tok->done = E_OK;
952 tok->inp = strchr(tok->buf, '\0');
953 done = tok->inp[-1] == '\n';
954 }
955 }
956 else {
957 cur = tok->cur - tok->buf;
958 if (decoding_feof(tok)) {
959 tok->done = E_EOF;
960 done = 1;
961 }
962 else
963 tok->done = E_OK;
964 }
965 tok->lineno++;
966 /* Read until '\n' or EOF */
967 while (!done) {
968 Py_ssize_t curstart = tok->start == NULL ? -1 :
969 tok->start - tok->buf;
970 Py_ssize_t curvalid = tok->inp - tok->buf;
971 Py_ssize_t newsize = curvalid + BUFSIZ;
972 char *newbuf = tok->buf;
973 newbuf = (char *)PyMem_REALLOC(newbuf,
974 newsize);
975 if (newbuf == NULL) {
976 tok->done = E_NOMEM;
977 tok->cur = tok->inp;
978 return EOF;
979 }
980 tok->buf = newbuf;
981 tok->inp = tok->buf + curvalid;
982 tok->end = tok->buf + newsize;
983 tok->start = curstart < 0 ? NULL :
984 tok->buf + curstart;
985 if (decoding_fgets(tok->inp,
986 (int)(tok->end - tok->inp),
987 tok) == NULL) {
988 /* Break out early on decoding
989 errors, as tok->buf will be NULL
990 */
991 if (tok->decoding_erred)
992 return EOF;
993 /* Last line does not end in \n,
994 fake one */
995 strcpy(tok->inp, "\n");
996 }
997 tok->inp = strchr(tok->inp, '\0');
998 done = tok->inp[-1] == '\n';
999 }
1000 if (tok->buf != NULL) {
1001 tok->cur = tok->buf + cur;
1002 tok->line_start = tok->cur;
1003 /* replace "\r\n" with "\n" */
1004 /* For Mac leave the \r, giving a syntax error */
1005 pt = tok->inp - 2;
1006 if (pt >= tok->buf && *pt == '\r') {
1007 *pt++ = '\n';
1008 *pt = '\0';
1009 tok->inp = pt;
1010 }
1011 }
1012 }
1013 if (tok->done != E_OK) {
1014 if (tok->prompt != NULL)
1015 PySys_WriteStderr("\n");
1016 tok->cur = tok->inp;
1017 return EOF;
1018 }
1019 }
1020 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001021}
1022
1023
1024/* Back-up one character */
1025
1026static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001027tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001028{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001029 if (c != EOF) {
1030 if (--tok->cur < tok->buf)
1031 Py_FatalError("tok_backup: beginning of buffer");
1032 if (*tok->cur != c)
1033 *tok->cur = c;
1034 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001035}
1036
1037
1038/* Return the token corresponding to a single character */
1039
1040int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001041PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001042{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001043 switch (c) {
1044 case '(': return LPAR;
1045 case ')': return RPAR;
1046 case '[': return LSQB;
1047 case ']': return RSQB;
1048 case ':': return COLON;
1049 case ',': return COMMA;
1050 case ';': return SEMI;
1051 case '+': return PLUS;
1052 case '-': return MINUS;
1053 case '*': return STAR;
1054 case '/': return SLASH;
1055 case '|': return VBAR;
1056 case '&': return AMPER;
1057 case '<': return LESS;
1058 case '>': return GREATER;
1059 case '=': return EQUAL;
1060 case '.': return DOT;
1061 case '%': return PERCENT;
1062 case '`': return BACKQUOTE;
1063 case '{': return LBRACE;
1064 case '}': return RBRACE;
1065 case '^': return CIRCUMFLEX;
1066 case '~': return TILDE;
1067 case '@': return AT;
1068 default: return OP;
1069 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001070}
1071
1072
Guido van Rossumfbab9051991-10-20 20:25:03 +00001073int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001074PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001075{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001076 switch (c1) {
1077 case '=':
1078 switch (c2) {
1079 case '=': return EQEQUAL;
1080 }
1081 break;
1082 case '!':
1083 switch (c2) {
1084 case '=': return NOTEQUAL;
1085 }
1086 break;
1087 case '<':
1088 switch (c2) {
1089 case '>': return NOTEQUAL;
1090 case '=': return LESSEQUAL;
1091 case '<': return LEFTSHIFT;
1092 }
1093 break;
1094 case '>':
1095 switch (c2) {
1096 case '=': return GREATEREQUAL;
1097 case '>': return RIGHTSHIFT;
1098 }
1099 break;
1100 case '+':
1101 switch (c2) {
1102 case '=': return PLUSEQUAL;
1103 }
1104 break;
1105 case '-':
1106 switch (c2) {
1107 case '=': return MINEQUAL;
1108 }
1109 break;
1110 case '*':
1111 switch (c2) {
1112 case '*': return DOUBLESTAR;
1113 case '=': return STAREQUAL;
1114 }
1115 break;
1116 case '/':
1117 switch (c2) {
1118 case '/': return DOUBLESLASH;
1119 case '=': return SLASHEQUAL;
1120 }
1121 break;
1122 case '|':
1123 switch (c2) {
1124 case '=': return VBAREQUAL;
1125 }
1126 break;
1127 case '%':
1128 switch (c2) {
1129 case '=': return PERCENTEQUAL;
1130 }
1131 break;
1132 case '&':
1133 switch (c2) {
1134 case '=': return AMPEREQUAL;
1135 }
1136 break;
1137 case '^':
1138 switch (c2) {
1139 case '=': return CIRCUMFLEXEQUAL;
1140 }
1141 break;
1142 }
1143 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001144}
1145
Thomas Wouters434d0822000-08-24 20:11:32 +00001146int
1147PyToken_ThreeChars(int c1, int c2, int c3)
1148{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001149 switch (c1) {
1150 case '<':
1151 switch (c2) {
1152 case '<':
1153 switch (c3) {
1154 case '=':
1155 return LEFTSHIFTEQUAL;
1156 }
1157 break;
1158 }
1159 break;
1160 case '>':
1161 switch (c2) {
1162 case '>':
1163 switch (c3) {
1164 case '=':
1165 return RIGHTSHIFTEQUAL;
1166 }
1167 break;
1168 }
1169 break;
1170 case '*':
1171 switch (c2) {
1172 case '*':
1173 switch (c3) {
1174 case '=':
1175 return DOUBLESTAREQUAL;
1176 }
1177 break;
1178 }
1179 break;
1180 case '/':
1181 switch (c2) {
1182 case '/':
1183 switch (c3) {
1184 case '=':
1185 return DOUBLESLASHEQUAL;
1186 }
1187 break;
1188 }
1189 break;
1190 }
1191 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001192}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001193
Guido van Rossum926f13a1998-04-09 21:38:06 +00001194static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001195indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001196{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001197 if (tok->alterror) {
1198 tok->done = E_TABSPACE;
1199 tok->cur = tok->inp;
1200 return 1;
1201 }
1202 if (tok->altwarning) {
1203 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1204 "in indentation\n", tok->filename);
1205 tok->altwarning = 0;
1206 }
1207 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001208}
1209
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001210/* Get next token, after space stripping etc. */
1211
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001212static int
1213tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001214{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001215 register int c;
1216 int blankline;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001217
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001218 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001219 nextline:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001220 tok->start = NULL;
1221 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001222
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001223 /* Get indentation level */
1224 if (tok->atbol) {
1225 register int col = 0;
1226 register int altcol = 0;
1227 tok->atbol = 0;
1228 for (;;) {
1229 c = tok_nextc(tok);
1230 if (c == ' ')
1231 col++, altcol++;
1232 else if (c == '\t') {
1233 col = (col/tok->tabsize + 1) * tok->tabsize;
1234 altcol = (altcol/tok->alttabsize + 1)
1235 * tok->alttabsize;
1236 }
1237 else if (c == '\014') /* Control-L (formfeed) */
1238 col = altcol = 0; /* For Emacs users */
1239 else
1240 break;
1241 }
1242 tok_backup(tok, c);
1243 if (c == '#' || c == '\n') {
1244 /* Lines with only whitespace and/or comments
1245 shouldn't affect the indentation and are
1246 not passed to the parser as NEWLINE tokens,
1247 except *totally* empty lines in interactive
1248 mode, which signal the end of a command group. */
1249 if (col == 0 && c == '\n' && tok->prompt != NULL)
1250 blankline = 0; /* Let it through */
1251 else
1252 blankline = 1; /* Ignore completely */
1253 /* We can't jump back right here since we still
1254 may need to skip to the end of a comment */
1255 }
1256 if (!blankline && tok->level == 0) {
1257 if (col == tok->indstack[tok->indent]) {
1258 /* No change */
1259 if (altcol != tok->altindstack[tok->indent]) {
1260 if (indenterror(tok))
1261 return ERRORTOKEN;
1262 }
1263 }
1264 else if (col > tok->indstack[tok->indent]) {
1265 /* Indent -- always one */
1266 if (tok->indent+1 >= MAXINDENT) {
1267 tok->done = E_TOODEEP;
1268 tok->cur = tok->inp;
1269 return ERRORTOKEN;
1270 }
1271 if (altcol <= tok->altindstack[tok->indent]) {
1272 if (indenterror(tok))
1273 return ERRORTOKEN;
1274 }
1275 tok->pendin++;
1276 tok->indstack[++tok->indent] = col;
1277 tok->altindstack[tok->indent] = altcol;
1278 }
1279 else /* col < tok->indstack[tok->indent] */ {
1280 /* Dedent -- any number, must be consistent */
1281 while (tok->indent > 0 &&
1282 col < tok->indstack[tok->indent]) {
1283 tok->pendin--;
1284 tok->indent--;
1285 }
1286 if (col != tok->indstack[tok->indent]) {
1287 tok->done = E_DEDENT;
1288 tok->cur = tok->inp;
1289 return ERRORTOKEN;
1290 }
1291 if (altcol != tok->altindstack[tok->indent]) {
1292 if (indenterror(tok))
1293 return ERRORTOKEN;
1294 }
1295 }
1296 }
1297 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001298
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001299 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001300
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001301 /* Return pending indents/dedents */
1302 if (tok->pendin != 0) {
1303 if (tok->pendin < 0) {
1304 tok->pendin++;
1305 return DEDENT;
1306 }
1307 else {
1308 tok->pendin--;
1309 return INDENT;
1310 }
1311 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001312
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 again:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001314 tok->start = NULL;
1315 /* Skip spaces */
1316 do {
1317 c = tok_nextc(tok);
1318 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001319
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001320 /* Set start of current token */
1321 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001322
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001323 /* Skip comment, while looking for tab-setting magic */
1324 if (c == '#') {
1325 static char *tabforms[] = {
1326 "tab-width:", /* Emacs */
1327 ":tabstop=", /* vim, full form */
1328 ":ts=", /* vim, abbreviated form */
1329 "set tabsize=", /* will vi never die? */
1330 /* more templates can be added here to support other editors */
1331 };
1332 char cbuf[80];
1333 char *tp, **cp;
1334 tp = cbuf;
1335 do {
1336 *tp++ = c = tok_nextc(tok);
1337 } while (c != EOF && c != '\n' &&
1338 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1339 *tp = '\0';
1340 for (cp = tabforms;
1341 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1342 cp++) {
1343 if ((tp = strstr(cbuf, *cp))) {
1344 int newsize = atoi(tp + strlen(*cp));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001345
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001346 if (newsize >= 1 && newsize <= 40) {
1347 tok->tabsize = newsize;
1348 if (Py_VerboseFlag)
1349 PySys_WriteStderr(
1350 "Tab size set to %d\n",
1351 newsize);
1352 }
1353 }
1354 }
1355 while (c != EOF && c != '\n')
1356 c = tok_nextc(tok);
1357 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001358
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001359 /* Check for EOF and errors now */
1360 if (c == EOF) {
1361 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1362 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001363
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001364 /* Identifier (most frequent token!) */
1365 if (Py_ISALPHA(c) || c == '_') {
1366 /* Process r"", u"" and ur"" */
1367 switch (c) {
1368 case 'b':
1369 case 'B':
1370 c = tok_nextc(tok);
1371 if (c == 'r' || c == 'R')
1372 c = tok_nextc(tok);
1373 if (c == '"' || c == '\'')
1374 goto letter_quote;
1375 break;
1376 case 'r':
1377 case 'R':
1378 c = tok_nextc(tok);
1379 if (c == '"' || c == '\'')
1380 goto letter_quote;
1381 break;
1382 case 'u':
1383 case 'U':
1384 c = tok_nextc(tok);
1385 if (c == 'r' || c == 'R')
1386 c = tok_nextc(tok);
1387 if (c == '"' || c == '\'')
1388 goto letter_quote;
1389 break;
1390 }
Stefan Krah3db41612010-06-24 09:33:05 +00001391 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001392 c = tok_nextc(tok);
1393 }
1394 tok_backup(tok, c);
1395 *p_start = tok->start;
1396 *p_end = tok->cur;
1397 return NAME;
1398 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001399
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001400 /* Newline */
1401 if (c == '\n') {
1402 tok->atbol = 1;
1403 if (blankline || tok->level > 0)
1404 goto nextline;
1405 *p_start = tok->start;
1406 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1407 tok->cont_line = 0;
1408 return NEWLINE;
1409 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001410
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001411 /* Period or number starting with period? */
1412 if (c == '.') {
1413 c = tok_nextc(tok);
1414 if (isdigit(c)) {
1415 goto fraction;
1416 }
1417 else {
1418 tok_backup(tok, c);
1419 *p_start = tok->start;
1420 *p_end = tok->cur;
1421 return DOT;
1422 }
1423 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001424
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001425 /* Number */
1426 if (isdigit(c)) {
1427 if (c == '0') {
1428 /* Hex, octal or binary -- maybe. */
1429 c = tok_nextc(tok);
1430 if (c == '.')
1431 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001432#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001433 if (c == 'j' || c == 'J')
1434 goto imaginary;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001435#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001436 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001437
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001438 /* Hex */
1439 c = tok_nextc(tok);
1440 if (!isxdigit(c)) {
1441 tok->done = E_TOKEN;
1442 tok_backup(tok, c);
1443 return ERRORTOKEN;
1444 }
1445 do {
1446 c = tok_nextc(tok);
1447 } while (isxdigit(c));
1448 }
1449 else if (c == 'o' || c == 'O') {
1450 /* Octal */
1451 c = tok_nextc(tok);
1452 if (c < '0' || c >= '8') {
1453 tok->done = E_TOKEN;
1454 tok_backup(tok, c);
1455 return ERRORTOKEN;
1456 }
1457 do {
1458 c = tok_nextc(tok);
1459 } while ('0' <= c && c < '8');
1460 }
1461 else if (c == 'b' || c == 'B') {
1462 /* Binary */
1463 c = tok_nextc(tok);
1464 if (c != '0' && c != '1') {
1465 tok->done = E_TOKEN;
1466 tok_backup(tok, c);
1467 return ERRORTOKEN;
1468 }
1469 do {
1470 c = tok_nextc(tok);
1471 } while (c == '0' || c == '1');
1472 }
1473 else {
1474 int found_decimal = 0;
1475 /* Octal; c is first char of it */
1476 /* There's no 'isoctdigit' macro, sigh */
1477 while ('0' <= c && c < '8') {
1478 c = tok_nextc(tok);
1479 }
1480 if (isdigit(c)) {
1481 found_decimal = 1;
1482 do {
1483 c = tok_nextc(tok);
1484 } while (isdigit(c));
1485 }
1486 if (c == '.')
1487 goto fraction;
1488 else if (c == 'e' || c == 'E')
1489 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001490#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001491 else if (c == 'j' || c == 'J')
1492 goto imaginary;
Tim Petersd507dab2001-08-30 20:51:59 +00001493#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001494 else if (found_decimal) {
1495 tok->done = E_TOKEN;
1496 tok_backup(tok, c);
1497 return ERRORTOKEN;
1498 }
1499 }
1500 if (c == 'l' || c == 'L')
1501 c = tok_nextc(tok);
1502 }
1503 else {
1504 /* Decimal */
1505 do {
1506 c = tok_nextc(tok);
1507 } while (isdigit(c));
1508 if (c == 'l' || c == 'L')
1509 c = tok_nextc(tok);
1510 else {
1511 /* Accept floating point numbers. */
1512 if (c == '.') {
1513 fraction:
1514 /* Fraction */
1515 do {
1516 c = tok_nextc(tok);
1517 } while (isdigit(c));
1518 }
1519 if (c == 'e' || c == 'E') {
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001520 int e;
1521 exponent:
1522 e = c;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001523 /* Exponent part */
1524 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001525 if (c == '+' || c == '-') {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001526 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001527 if (!isdigit(c)) {
1528 tok->done = E_TOKEN;
1529 tok_backup(tok, c);
1530 return ERRORTOKEN;
1531 }
1532 } else if (!isdigit(c)) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001533 tok_backup(tok, c);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001534 tok_backup(tok, e);
1535 *p_start = tok->start;
1536 *p_end = tok->cur;
1537 return NUMBER;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001538 }
1539 do {
1540 c = tok_nextc(tok);
1541 } while (isdigit(c));
1542 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001543#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001544 if (c == 'j' || c == 'J')
1545 /* Imaginary part */
1546 imaginary:
1547 c = tok_nextc(tok);
Guido van Rossumf595fde1996-01-12 01:31:58 +00001548#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001549 }
1550 }
1551 tok_backup(tok, c);
1552 *p_start = tok->start;
1553 *p_end = tok->cur;
1554 return NUMBER;
1555 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001556
1557 letter_quote:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001558 /* String */
1559 if (c == '\'' || c == '"') {
1560 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1561 int quote = c;
1562 int triple = 0;
1563 int tripcount = 0;
1564 for (;;) {
1565 c = tok_nextc(tok);
1566 if (c == '\n') {
1567 if (!triple) {
1568 tok->done = E_EOLS;
1569 tok_backup(tok, c);
1570 return ERRORTOKEN;
1571 }
1572 tripcount = 0;
1573 tok->cont_line = 1; /* multiline string. */
1574 }
1575 else if (c == EOF) {
1576 if (triple)
1577 tok->done = E_EOFS;
1578 else
1579 tok->done = E_EOLS;
1580 tok->cur = tok->inp;
1581 return ERRORTOKEN;
1582 }
1583 else if (c == quote) {
1584 tripcount++;
1585 if (tok->cur - tok->start == quote2) {
1586 c = tok_nextc(tok);
1587 if (c == quote) {
1588 triple = 1;
1589 tripcount = 0;
1590 continue;
1591 }
1592 tok_backup(tok, c);
1593 }
1594 if (!triple || tripcount == 3)
1595 break;
1596 }
1597 else if (c == '\\') {
1598 tripcount = 0;
1599 c = tok_nextc(tok);
1600 if (c == EOF) {
1601 tok->done = E_EOLS;
1602 tok->cur = tok->inp;
1603 return ERRORTOKEN;
1604 }
1605 }
1606 else
1607 tripcount = 0;
1608 }
1609 *p_start = tok->start;
1610 *p_end = tok->cur;
1611 return STRING;
1612 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001613
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001614 /* Line continuation */
1615 if (c == '\\') {
1616 c = tok_nextc(tok);
1617 if (c != '\n') {
1618 tok->done = E_LINECONT;
1619 tok->cur = tok->inp;
1620 return ERRORTOKEN;
1621 }
1622 tok->cont_line = 1;
1623 goto again; /* Read next line */
1624 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001625
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001626 /* Check for two-character token */
1627 {
1628 int c2 = tok_nextc(tok);
1629 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001630#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001631 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1632 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1633 "<> not supported in 3.x; use !=",
1634 tok->filename, tok->lineno,
1635 NULL, NULL)) {
1636 return ERRORTOKEN;
1637 }
1638 }
Christian Heimes02c9ab52007-11-23 12:12:02 +00001639#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001640 if (token != OP) {
1641 int c3 = tok_nextc(tok);
1642 int token3 = PyToken_ThreeChars(c, c2, c3);
1643 if (token3 != OP) {
1644 token = token3;
1645 } else {
1646 tok_backup(tok, c3);
1647 }
1648 *p_start = tok->start;
1649 *p_end = tok->cur;
1650 return token;
1651 }
1652 tok_backup(tok, c2);
1653 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001654
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001655 /* Keep track of parentheses nesting level */
1656 switch (c) {
1657 case '(':
1658 case '[':
1659 case '{':
1660 tok->level++;
1661 break;
1662 case ')':
1663 case ']':
1664 case '}':
1665 tok->level--;
1666 break;
1667 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001668
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001669 /* Punctuation character */
1670 *p_start = tok->start;
1671 *p_end = tok->cur;
1672 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001673}
1674
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001675int
1676PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1677{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001678 int result = tok_get(tok, p_start, p_end);
1679 if (tok->decoding_erred) {
1680 result = ERRORTOKEN;
1681 tok->done = E_DECODE;
1682 }
1683 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001684}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001685
Martin v. Löwisa5136192007-09-04 14:19:28 +00001686/* This function is only called from parsetok. However, it cannot live
1687 there, as it must be empty for PGEN, and we can check for PGEN only
1688 in this file. */
1689
Christian Heimes082c9b02008-01-23 14:20:50 +00001690#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant for PGEN or builds without unicode support: there is nothing
   to convert, so always return NULL and leave *offset untouched. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    char *no_text = NULL;

    (void)tok;
    (void)len;
    (void)offset;
    return no_text;
}
1696#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001697#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001698static PyObject *
1699dec_utf8(const char *enc, const char *text, size_t len) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001700 PyObject *ret = NULL;
1701 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1702 if (unicode_text) {
1703 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1704 Py_DECREF(unicode_text);
1705 }
1706 if (!ret) {
1707 PyErr_Clear();
1708 }
1709 return ret;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001710}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001711char *
1712PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1713{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001714 char *text = NULL;
1715 if (tok->encoding) {
1716 /* convert source to original encondig */
1717 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1718 if (lineobj != NULL) {
1719 int linelen = PyString_Size(lineobj);
1720 const char *line = PyString_AsString(lineobj);
1721 text = PyObject_MALLOC(linelen + 1);
1722 if (text != NULL && line != NULL) {
1723 if (linelen)
1724 strncpy(text, line, linelen);
1725 text[linelen] = '\0';
1726 }
1727 Py_DECREF(lineobj);
1728
1729 /* adjust error offset */
1730 if (*offset > 1) {
1731 PyObject *offsetobj = dec_utf8(tok->encoding,
1732 tok->buf, *offset-1);
1733 if (offsetobj) {
1734 *offset = PyString_Size(offsetobj) + 1;
1735 Py_DECREF(offsetobj);
1736 }
1737 }
1738
1739 }
1740 }
1741 return text;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001742
1743}
Georg Brandl76b30d12008-01-07 18:41:34 +00001744#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001745#endif
1746
Martin v. Löwisa5136192007-09-04 14:19:28 +00001747
Guido van Rossum408027e1996-12-30 16:17:54 +00001748#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001749
1750void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001751tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001752{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001753 printf("%s", _PyParser_TokenNames[type]);
1754 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1755 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001756}
1757
1758#endif