blob: 16d311c83d77bcc23ede20e1218f664aaf0644ed [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000098tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000130 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131}
132
Benjamin Petersone36199b2009-11-12 23:39:44 +0000133static char *
134new_string(const char *s, Py_ssize_t len)
135{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000142}
143
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144#ifdef PGEN
145
146static char *
147decoding_fgets(char *s, int size, struct tok_state *tok)
148{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000150}
151
152static int
153decoding_feof(struct tok_state *tok)
154{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000155 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000156}
157
/* PGEN build: the input is not decoded; return a fresh copy of STR.
   EXEC_INPUT and TOK are not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
163
164#else /* PGEN */
165
166static char *
167error_ret(struct tok_state *tok) /* XXX */
168{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171 PyMem_FREE(tok->buf);
172 tok->buf = NULL;
173 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176
/* Map common spellings of UTF-8 and Latin-1 to one canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is recognized (the result then must not be freed
   or modified), otherwise returns S itself unchanged. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000211 Py_ssize_t i;
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000230
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000231 begin = t;
232 while (Py_ISALNUM(t[0]) ||
233 t[0] == '-' || t[0] == '_' || t[0] == '.')
234 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
240 PyMem_FREE(r);
241 r = new_string(q, strlen(q));
242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000257 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000259 char * cs;
260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
265 cs = get_coding_spec(line, size);
266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300280 else {
281 PyErr_Format(PyExc_SyntaxError,
282 "encoding problem: %s", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000283 PyMem_FREE(cs);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300284 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000285#else
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000286 /* Without Unicode support, we cannot
287 process the coding spec. Since there
288 won't be any Unicode literals, that
289 won't matter. */
290 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000291#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000292 }
293 } else { /* then, compare cs with BOM */
294 r = (strcmp(tok->encoding, cs) == 0);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300295 if (!r)
296 PyErr_Format(PyExc_SyntaxError,
297 "encoding problem: %s with BOM", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000298 PyMem_FREE(cs);
299 }
300 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000301 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302}
303
304/* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
307
308static int
309check_bom(int get_char(struct tok_state *),
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000310 void unget_char(int, struct tok_state *),
311 int set_readline(struct tok_state *, const char *),
312 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000313{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000314 int ch1, ch2, ch3;
315 ch1 = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch1 == EOF) {
318 return 1;
319 } else if (ch1 == 0xEF) {
320 ch2 = get_char(tok);
321 if (ch2 != 0xBB) {
322 unget_char(ch2, tok);
323 unget_char(ch1, tok);
324 return 1;
325 }
326 ch3 = get_char(tok);
327 if (ch3 != 0xBF) {
328 unget_char(ch3, tok);
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#if 0
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
336 } else if (ch1 == 0xFE) {
337 ch2 = get_char(tok);
338 if (ch2 != 0xFF) {
339 unget_char(ch2, tok);
340 unget_char(ch1, tok);
341 return 1;
342 }
343 if (!set_readline(tok, "utf-16-be"))
344 return 0;
345 tok->decoding_state = -1;
346 } else if (ch1 == 0xFF) {
347 ch2 = get_char(tok);
348 if (ch2 != 0xFE) {
349 unget_char(ch2, tok);
350 unget_char(ch1, tok);
351 return 1;
352 }
353 if (!set_readline(tok, "utf-16-le"))
354 return 0;
355 tok->decoding_state = -1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000357 } else {
358 unget_char(ch1, tok);
359 return 1;
360 }
361 if (tok->encoding != NULL)
362 PyMem_FREE(tok->encoding);
363 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
364 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365}
366
367/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368 Return NULL on failure, else S.
Tim Petersc9d78aa2006-03-26 23:27:58 +0000369
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000370 On entry, tok->decoding_buffer will be one of:
371 1) NULL: need to call tok->decoding_readline to get a new line
372 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000373 stored the result in tok->decoding_buffer
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000374 3) PyStringObject *: previous call to fp_readl did not have enough room
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000375 (in the s buffer) to copy entire contents of the line read
376 by tok->decoding_readline. tok->decoding_buffer has the overflow.
377 In this case, fp_readl is called in a loop (with an expanded buffer)
378 until the buffer ends with a '\n' (or until the end of the file is
379 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000380*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    int room;

    /* Keep one byte of S back for the terminating NUL. */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over whatever an earlier call left in the tok_state. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending)) {
            /* A string object needs no further encoding. */
            encoded = pending;
        }
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL) {
            return error_ret(tok);
        }
        if (!PyUnicode_Check(pending)) {
            Py_DECREF(pending);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL) {
            return error_ret(tok);
        }
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Too long for S: park the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    /* An empty read means EOF. */
    return (nbytes == 0) ? NULL : s;
#endif
}
438
439/* Set the readline function for TOK to a StreamReader's
440 readline function. The StreamReader is named ENC.
441
442 This function is called from check_bom and check_coding_spec.
443
444 ENC is usually identical to the future value of tok->encoding,
445 except for the (currently unsupported) case of UTF-16.
446
447 Return 1 on success, 0 on failure. */
448
449static int
450fp_setreadl(struct tok_state *tok, const char* enc)
451{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000452 PyObject *reader, *stream, *readline;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000454 /* XXX: constify filename argument. */
455 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
456 if (stream == NULL)
457 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000459 reader = PyCodec_StreamReader(enc, stream, NULL);
460 Py_DECREF(stream);
461 if (reader == NULL)
462 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000464 readline = PyObject_GetAttrString(reader, "readline");
465 Py_DECREF(reader);
466 if (readline == NULL)
467 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000469 tok->decoding_readline = readline;
470 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471}
472
473/* Fetch the next byte from TOK. */
474
475static int fp_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000476 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477}
478
479/* Unfetch the last byte back into TOK. */
480
481static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000482 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000483}
484
485/* Read a line of input from TOK. Determine encoding
486 if necessary. */
487
488static char *
489decoding_fgets(char *s, int size, struct tok_state *tok)
490{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000491 char *line = NULL;
492 int badchar = 0;
493 for (;;) {
494 if (tok->decoding_state < 0) {
495 /* We already have a codec associated with
496 this input. */
497 line = fp_readl(s, size, tok);
498 break;
499 } else if (tok->decoding_state > 0) {
500 /* We want a 'raw' read. */
501 line = Py_UniversalNewlineFgets(s, size,
502 tok->fp, NULL);
503 break;
504 } else {
505 /* We have not yet determined the encoding.
506 If an encoding is found, use the file-pointer
507 reader functions from now on. */
508 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
509 return error_ret(tok);
510 assert(tok->decoding_state != 0);
511 }
512 }
513 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
514 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
515 return error_ret(tok);
516 }
517 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000518#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000519 /* The default encoding is ASCII, so make sure we don't have any
520 non-ASCII bytes in it. */
521 if (line && !tok->encoding) {
522 unsigned char *c;
523 for (c = (unsigned char *)line; *c; c++)
524 if (*c > 127) {
525 badchar = *c;
526 break;
527 }
528 }
529 if (badchar) {
530 char buf[500];
531 /* Need to add 1 to the line number, since this line
532 has not been counted, yet. */
533 sprintf(buf,
534 "Non-ASCII character '\\x%.2x' "
535 "in file %.200s on line %i, "
536 "but no encoding declared; "
537 "see http://www.python.org/peps/pep-0263.html for details",
538 badchar, tok->filename, tok->lineno + 1);
539 PyErr_SetString(PyExc_SyntaxError, buf);
540 return error_ret(tok);
541 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000543 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544}
545
546static int
547decoding_feof(struct tok_state *tok)
548{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000549 if (tok->decoding_state >= 0) {
550 return feof(tok->fp);
551 } else {
552 PyObject* buf = tok->decoding_buffer;
553 if (buf == NULL) {
554 buf = PyObject_CallObject(tok->decoding_readline, NULL);
555 if (buf == NULL) {
556 error_ret(tok);
557 return 1;
558 } else {
559 tok->decoding_buffer = buf;
560 }
561 }
562 return PyObject_Length(buf) == 0;
563 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000564}
565
566/* Fetch a byte from TOK, using the string buffer. */
567
Tim Petersc9d78aa2006-03-26 23:27:58 +0000568static int
569buf_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000570 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571}
572
573/* Unfetch a byte from TOK, using the string buffer. */
574
Tim Petersc9d78aa2006-03-26 23:27:58 +0000575static void
576buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000577 tok->str--;
578 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579}
580
581/* Set the readline function for TOK to ENC. For the string-based
582 tokenizer, this means to just record the encoding. */
583
Tim Petersc9d78aa2006-03-26 23:27:58 +0000584static int
585buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000586 tok->enc = enc;
587 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588}
589
590/* Return a UTF-8 encoding Python string object from the
591 C byte string STR, which is encoded with ENC. */
592
Martin v. Löwis019934b2002-08-07 12:33:18 +0000593#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594static PyObject *
595translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000596 PyObject *utf8;
597 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
598 if (buf == NULL)
599 return NULL;
600 utf8 = PyUnicode_AsUTF8String(buf);
601 Py_DECREF(buf);
602 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000604#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605
Benjamin Petersone36199b2009-11-12 23:39:44 +0000606
607static char *
608translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000609 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
610 char *buf, *current;
611 char c = '\0';
612 buf = PyMem_MALLOC(needed_length);
613 if (buf == NULL) {
614 tok->done = E_NOMEM;
615 return NULL;
616 }
617 for (current = buf; *s; s++, current++) {
618 c = *s;
619 if (skip_next_lf) {
620 skip_next_lf = 0;
621 if (c == '\n') {
622 c = *++s;
623 if (!c)
624 break;
625 }
626 }
627 if (c == '\r') {
628 skip_next_lf = 1;
629 c = '\n';
630 }
631 *current = c;
632 }
633 /* If this is exec input, add a newline to the end of the string if
634 there isn't one already. */
635 if (exec_input && c != '\n') {
636 *current = '\n';
637 current++;
638 }
639 *current = '\0';
640 final_length = current - buf + 1;
641 if (final_length < needed_length && final_length)
642 /* should never fail */
643 buf = PyMem_REALLOC(buf, final_length);
644 return buf;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000645}
646
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000647/* Decode a byte string STR for use as the buffer of TOK.
648 Look for encoding declarations inside STR, and record them
649 inside TOK. */
650
651static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000652decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000654 PyObject* utf8 = NULL;
655 const char *str;
656 const char *s;
657 const char *newl[2] = {NULL, NULL};
658 int lineno = 0;
659 tok->input = str = translate_newlines(input, single, tok);
660 if (str == NULL)
661 return NULL;
662 tok->enc = NULL;
663 tok->str = str;
664 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
665 return error_ret(tok);
666 str = tok->str; /* string after BOM if any */
667 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000668#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000669 if (tok->enc != NULL) {
670 utf8 = translate_into_utf8(str, tok->enc);
671 if (utf8 == NULL)
672 return error_ret(tok);
673 str = PyString_AsString(utf8);
674 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000675#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000676 for (s = str;; s++) {
677 if (*s == '\0') break;
678 else if (*s == '\n') {
679 assert(lineno < 2);
680 newl[lineno] = s;
681 lineno++;
682 if (lineno == 2) break;
683 }
684 }
685 tok->enc = NULL;
686 /* need to check line 1 and 2 separately since check_coding_spec
687 assumes a single line as input */
688 if (newl[0]) {
689 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
690 return error_ret(tok);
691 if (tok->enc == NULL && newl[1]) {
692 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
693 tok, buf_setreadl))
694 return error_ret(tok);
695 }
696 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000697#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000698 if (tok->enc != NULL) {
699 assert(utf8 == NULL);
700 utf8 = translate_into_utf8(str, tok->enc);
701 if (utf8 == NULL)
702 return error_ret(tok);
703 str = PyString_AsString(utf8);
704 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000705#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000706 assert(tok->decoding_buffer == NULL);
707 tok->decoding_buffer = utf8; /* CAUTION */
708 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000709}
710
711#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712
713/* Set up tokenizer for string */
714
715struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000716PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000718 struct tok_state *tok = tok_new();
719 if (tok == NULL)
720 return NULL;
721 str = (char *)decode_str(str, exec_input, tok);
722 if (str == NULL) {
723 PyTokenizer_Free(tok);
724 return NULL;
725 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000726
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000727 /* XXX: constify members. */
728 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
729 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000730}
731
732
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000733/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000734
735struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000736PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000738 struct tok_state *tok = tok_new();
739 if (tok == NULL)
740 return NULL;
741 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
742 PyTokenizer_Free(tok);
743 return NULL;
744 }
745 tok->cur = tok->inp = tok->buf;
746 tok->end = tok->buf + BUFSIZ;
747 tok->fp = fp;
748 tok->prompt = ps1;
749 tok->nextprompt = ps2;
750 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751}
752
753
754/* Free a tok_state structure */
755
756void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000757PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000759 if (tok->encoding != NULL)
760 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000761#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000762 Py_XDECREF(tok->decoding_readline);
763 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000764#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000765 if (tok->fp != NULL && tok->buf != NULL)
766 PyMem_FREE(tok->buf);
767 if (tok->input)
768 PyMem_FREE((char *)tok->input);
769 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770}
771
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000772#if !defined(PGEN) && defined(Py_USING_UNICODE)
773static int
774tok_stdin_decode(struct tok_state *tok, char **inp)
775{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000776 PyObject *enc, *sysstdin, *decoded, *utf8;
777 const char *encoding;
778 char *converted;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000779
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000780 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
781 return 0;
782 sysstdin = PySys_GetObject("stdin");
783 if (sysstdin == NULL || !PyFile_Check(sysstdin))
784 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000785
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000786 enc = ((PyFileObject *)sysstdin)->f_encoding;
787 if (enc == NULL || !PyString_Check(enc))
788 return 0;
789 Py_INCREF(enc);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000790
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000791 encoding = PyString_AsString(enc);
792 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
793 if (decoded == NULL)
794 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000795
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000796 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
797 Py_DECREF(decoded);
798 if (utf8 == NULL)
799 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000800
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000801 assert(PyString_Check(utf8));
802 converted = new_string(PyString_AS_STRING(utf8),
803 PyString_GET_SIZE(utf8));
804 Py_DECREF(utf8);
805 if (converted == NULL)
806 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000807
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000808 PyMem_FREE(*inp);
809 *inp = converted;
810 if (tok->encoding != NULL)
811 PyMem_FREE(tok->encoding);
812 tok->encoding = new_string(encoding, strlen(encoding));
813 if (tok->encoding == NULL)
814 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000815
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000816 Py_DECREF(enc);
817 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000818
819error_nomem:
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000820 Py_DECREF(enc);
821 tok->done = E_NOMEM;
822 return -1;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000823
824error_clear:
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000825 Py_DECREF(enc);
826 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
827 tok->done = E_ERROR;
828 return -1;
829 }
830 /* Fallback to iso-8859-1: for backward compatibility */
831 PyErr_Clear();
832 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000833}
834#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835
836/* Get next char, updating state; error code goes into tok->done */
837
838static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000839tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000840{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000841 for (;;) {
842 if (tok->cur != tok->inp) {
843 return Py_CHARMASK(*tok->cur++); /* Fast path */
844 }
845 if (tok->done != E_OK)
846 return EOF;
847 if (tok->fp == NULL) {
848 char *end = strchr(tok->inp, '\n');
849 if (end != NULL)
850 end++;
851 else {
852 end = strchr(tok->inp, '\0');
853 if (end == tok->inp) {
854 tok->done = E_EOF;
855 return EOF;
856 }
857 }
858 if (tok->start == NULL)
859 tok->buf = tok->cur;
860 tok->line_start = tok->cur;
861 tok->lineno++;
862 tok->inp = end;
863 return Py_CHARMASK(*tok->cur++);
864 }
865 if (tok->prompt != NULL) {
866 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
867 if (tok->nextprompt != NULL)
868 tok->prompt = tok->nextprompt;
869 if (newtok == NULL)
870 tok->done = E_INTR;
871 else if (*newtok == '\0') {
872 PyMem_FREE(newtok);
873 tok->done = E_EOF;
874 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000875#if !defined(PGEN) && defined(Py_USING_UNICODE)
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000876 else if (tok_stdin_decode(tok, &newtok) != 0)
877 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000878#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000879 else if (tok->start != NULL) {
880 size_t start = tok->start - tok->buf;
881 size_t oldlen = tok->cur - tok->buf;
882 size_t newlen = oldlen + strlen(newtok);
883 char *buf = tok->buf;
884 buf = (char *)PyMem_REALLOC(buf, newlen+1);
885 tok->lineno++;
886 if (buf == NULL) {
887 PyMem_FREE(tok->buf);
888 tok->buf = NULL;
889 PyMem_FREE(newtok);
890 tok->done = E_NOMEM;
891 return EOF;
892 }
893 tok->buf = buf;
894 tok->cur = tok->buf + oldlen;
895 tok->line_start = tok->cur;
896 strcpy(tok->buf + oldlen, newtok);
897 PyMem_FREE(newtok);
898 tok->inp = tok->buf + newlen;
899 tok->end = tok->inp + 1;
900 tok->start = tok->buf + start;
901 }
902 else {
903 tok->lineno++;
904 if (tok->buf != NULL)
905 PyMem_FREE(tok->buf);
906 tok->buf = newtok;
907 tok->line_start = tok->buf;
908 tok->cur = tok->buf;
909 tok->line_start = tok->buf;
910 tok->inp = strchr(tok->buf, '\0');
911 tok->end = tok->inp + 1;
912 }
913 }
914 else {
915 int done = 0;
916 Py_ssize_t cur = 0;
917 char *pt;
918 if (tok->start == NULL) {
919 if (tok->buf == NULL) {
920 tok->buf = (char *)
921 PyMem_MALLOC(BUFSIZ);
922 if (tok->buf == NULL) {
923 tok->done = E_NOMEM;
924 return EOF;
925 }
926 tok->end = tok->buf + BUFSIZ;
927 }
928 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
929 tok) == NULL) {
930 tok->done = E_EOF;
931 done = 1;
932 }
933 else {
934 tok->done = E_OK;
935 tok->inp = strchr(tok->buf, '\0');
936 done = tok->inp[-1] == '\n';
937 }
938 }
939 else {
940 cur = tok->cur - tok->buf;
941 if (decoding_feof(tok)) {
942 tok->done = E_EOF;
943 done = 1;
944 }
945 else
946 tok->done = E_OK;
947 }
948 tok->lineno++;
949 /* Read until '\n' or EOF */
950 while (!done) {
951 Py_ssize_t curstart = tok->start == NULL ? -1 :
952 tok->start - tok->buf;
953 Py_ssize_t curvalid = tok->inp - tok->buf;
954 Py_ssize_t newsize = curvalid + BUFSIZ;
955 char *newbuf = tok->buf;
956 newbuf = (char *)PyMem_REALLOC(newbuf,
957 newsize);
958 if (newbuf == NULL) {
959 tok->done = E_NOMEM;
960 tok->cur = tok->inp;
961 return EOF;
962 }
963 tok->buf = newbuf;
964 tok->inp = tok->buf + curvalid;
965 tok->end = tok->buf + newsize;
966 tok->start = curstart < 0 ? NULL :
967 tok->buf + curstart;
968 if (decoding_fgets(tok->inp,
969 (int)(tok->end - tok->inp),
970 tok) == NULL) {
971 /* Break out early on decoding
972 errors, as tok->buf will be NULL
973 */
974 if (tok->decoding_erred)
975 return EOF;
976 /* Last line does not end in \n,
977 fake one */
978 strcpy(tok->inp, "\n");
979 }
980 tok->inp = strchr(tok->inp, '\0');
981 done = tok->inp[-1] == '\n';
982 }
983 if (tok->buf != NULL) {
984 tok->cur = tok->buf + cur;
985 tok->line_start = tok->cur;
986 /* replace "\r\n" with "\n" */
987 /* For Mac leave the \r, giving a syntax error */
988 pt = tok->inp - 2;
989 if (pt >= tok->buf && *pt == '\r') {
990 *pt++ = '\n';
991 *pt = '\0';
992 tok->inp = pt;
993 }
994 }
995 }
996 if (tok->done != E_OK) {
997 if (tok->prompt != NULL)
998 PySys_WriteStderr("\n");
999 tok->cur = tok->inp;
1000 return EOF;
1001 }
1002 }
1003 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001004}
1005
1006
1007/* Back-up one character */
1008
1009static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001010tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001011{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001012 if (c != EOF) {
1013 if (--tok->cur < tok->buf)
1014 Py_FatalError("tok_backup: beginning of buffer");
1015 if (*tok->cur != c)
1016 *tok->cur = c;
1017 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001018}
1019
1020
1021/* Return the token corresponding to a single character */
1022
1023int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001024PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001026 switch (c) {
1027 case '(': return LPAR;
1028 case ')': return RPAR;
1029 case '[': return LSQB;
1030 case ']': return RSQB;
1031 case ':': return COLON;
1032 case ',': return COMMA;
1033 case ';': return SEMI;
1034 case '+': return PLUS;
1035 case '-': return MINUS;
1036 case '*': return STAR;
1037 case '/': return SLASH;
1038 case '|': return VBAR;
1039 case '&': return AMPER;
1040 case '<': return LESS;
1041 case '>': return GREATER;
1042 case '=': return EQUAL;
1043 case '.': return DOT;
1044 case '%': return PERCENT;
1045 case '`': return BACKQUOTE;
1046 case '{': return LBRACE;
1047 case '}': return RBRACE;
1048 case '^': return CIRCUMFLEX;
1049 case '~': return TILDE;
1050 case '@': return AT;
1051 default: return OP;
1052 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001053}
1054
1055
Guido van Rossumfbab9051991-10-20 20:25:03 +00001056int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001057PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001058{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001059 switch (c1) {
1060 case '=':
1061 switch (c2) {
1062 case '=': return EQEQUAL;
1063 }
1064 break;
1065 case '!':
1066 switch (c2) {
1067 case '=': return NOTEQUAL;
1068 }
1069 break;
1070 case '<':
1071 switch (c2) {
1072 case '>': return NOTEQUAL;
1073 case '=': return LESSEQUAL;
1074 case '<': return LEFTSHIFT;
1075 }
1076 break;
1077 case '>':
1078 switch (c2) {
1079 case '=': return GREATEREQUAL;
1080 case '>': return RIGHTSHIFT;
1081 }
1082 break;
1083 case '+':
1084 switch (c2) {
1085 case '=': return PLUSEQUAL;
1086 }
1087 break;
1088 case '-':
1089 switch (c2) {
1090 case '=': return MINEQUAL;
1091 }
1092 break;
1093 case '*':
1094 switch (c2) {
1095 case '*': return DOUBLESTAR;
1096 case '=': return STAREQUAL;
1097 }
1098 break;
1099 case '/':
1100 switch (c2) {
1101 case '/': return DOUBLESLASH;
1102 case '=': return SLASHEQUAL;
1103 }
1104 break;
1105 case '|':
1106 switch (c2) {
1107 case '=': return VBAREQUAL;
1108 }
1109 break;
1110 case '%':
1111 switch (c2) {
1112 case '=': return PERCENTEQUAL;
1113 }
1114 break;
1115 case '&':
1116 switch (c2) {
1117 case '=': return AMPEREQUAL;
1118 }
1119 break;
1120 case '^':
1121 switch (c2) {
1122 case '=': return CIRCUMFLEXEQUAL;
1123 }
1124 break;
1125 }
1126 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001127}
1128
Thomas Wouters434d0822000-08-24 20:11:32 +00001129int
1130PyToken_ThreeChars(int c1, int c2, int c3)
1131{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001132 switch (c1) {
1133 case '<':
1134 switch (c2) {
1135 case '<':
1136 switch (c3) {
1137 case '=':
1138 return LEFTSHIFTEQUAL;
1139 }
1140 break;
1141 }
1142 break;
1143 case '>':
1144 switch (c2) {
1145 case '>':
1146 switch (c3) {
1147 case '=':
1148 return RIGHTSHIFTEQUAL;
1149 }
1150 break;
1151 }
1152 break;
1153 case '*':
1154 switch (c2) {
1155 case '*':
1156 switch (c3) {
1157 case '=':
1158 return DOUBLESTAREQUAL;
1159 }
1160 break;
1161 }
1162 break;
1163 case '/':
1164 switch (c2) {
1165 case '/':
1166 switch (c3) {
1167 case '=':
1168 return DOUBLESLASHEQUAL;
1169 }
1170 break;
1171 }
1172 break;
1173 }
1174 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001175}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001176
Guido van Rossum926f13a1998-04-09 21:38:06 +00001177static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001178indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001179{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001180 if (tok->alterror) {
1181 tok->done = E_TABSPACE;
1182 tok->cur = tok->inp;
1183 return 1;
1184 }
1185 if (tok->altwarning) {
1186 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1187 "in indentation\n", tok->filename);
1188 tok->altwarning = 0;
1189 }
1190 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001191}
1192
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193/* Get next token, after space stripping etc. */
1194
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001195static int
1196tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001197{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001198 register int c;
1199 int blankline;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001201 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001202 nextline:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001203 tok->start = NULL;
1204 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001205
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001206 /* Get indentation level */
1207 if (tok->atbol) {
1208 register int col = 0;
1209 register int altcol = 0;
1210 tok->atbol = 0;
1211 for (;;) {
1212 c = tok_nextc(tok);
1213 if (c == ' ')
1214 col++, altcol++;
1215 else if (c == '\t') {
1216 col = (col/tok->tabsize + 1) * tok->tabsize;
1217 altcol = (altcol/tok->alttabsize + 1)
1218 * tok->alttabsize;
1219 }
1220 else if (c == '\014') /* Control-L (formfeed) */
1221 col = altcol = 0; /* For Emacs users */
1222 else
1223 break;
1224 }
1225 tok_backup(tok, c);
1226 if (c == '#' || c == '\n') {
1227 /* Lines with only whitespace and/or comments
1228 shouldn't affect the indentation and are
1229 not passed to the parser as NEWLINE tokens,
1230 except *totally* empty lines in interactive
1231 mode, which signal the end of a command group. */
1232 if (col == 0 && c == '\n' && tok->prompt != NULL)
1233 blankline = 0; /* Let it through */
1234 else
1235 blankline = 1; /* Ignore completely */
1236 /* We can't jump back right here since we still
1237 may need to skip to the end of a comment */
1238 }
1239 if (!blankline && tok->level == 0) {
1240 if (col == tok->indstack[tok->indent]) {
1241 /* No change */
1242 if (altcol != tok->altindstack[tok->indent]) {
1243 if (indenterror(tok))
1244 return ERRORTOKEN;
1245 }
1246 }
1247 else if (col > tok->indstack[tok->indent]) {
1248 /* Indent -- always one */
1249 if (tok->indent+1 >= MAXINDENT) {
1250 tok->done = E_TOODEEP;
1251 tok->cur = tok->inp;
1252 return ERRORTOKEN;
1253 }
1254 if (altcol <= tok->altindstack[tok->indent]) {
1255 if (indenterror(tok))
1256 return ERRORTOKEN;
1257 }
1258 tok->pendin++;
1259 tok->indstack[++tok->indent] = col;
1260 tok->altindstack[tok->indent] = altcol;
1261 }
1262 else /* col < tok->indstack[tok->indent] */ {
1263 /* Dedent -- any number, must be consistent */
1264 while (tok->indent > 0 &&
1265 col < tok->indstack[tok->indent]) {
1266 tok->pendin--;
1267 tok->indent--;
1268 }
1269 if (col != tok->indstack[tok->indent]) {
1270 tok->done = E_DEDENT;
1271 tok->cur = tok->inp;
1272 return ERRORTOKEN;
1273 }
1274 if (altcol != tok->altindstack[tok->indent]) {
1275 if (indenterror(tok))
1276 return ERRORTOKEN;
1277 }
1278 }
1279 }
1280 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001281
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001282 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001283
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001284 /* Return pending indents/dedents */
1285 if (tok->pendin != 0) {
1286 if (tok->pendin < 0) {
1287 tok->pendin++;
1288 return DEDENT;
1289 }
1290 else {
1291 tok->pendin--;
1292 return INDENT;
1293 }
1294 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001295
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 again:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001297 tok->start = NULL;
1298 /* Skip spaces */
1299 do {
1300 c = tok_nextc(tok);
1301 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001302
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001303 /* Set start of current token */
1304 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001305
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001306 /* Skip comment, while looking for tab-setting magic */
1307 if (c == '#') {
1308 static char *tabforms[] = {
1309 "tab-width:", /* Emacs */
1310 ":tabstop=", /* vim, full form */
1311 ":ts=", /* vim, abbreviated form */
1312 "set tabsize=", /* will vi never die? */
1313 /* more templates can be added here to support other editors */
1314 };
1315 char cbuf[80];
1316 char *tp, **cp;
1317 tp = cbuf;
1318 do {
1319 *tp++ = c = tok_nextc(tok);
1320 } while (c != EOF && c != '\n' &&
1321 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1322 *tp = '\0';
1323 for (cp = tabforms;
1324 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1325 cp++) {
1326 if ((tp = strstr(cbuf, *cp))) {
1327 int newsize = atoi(tp + strlen(*cp));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001328
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001329 if (newsize >= 1 && newsize <= 40) {
1330 tok->tabsize = newsize;
1331 if (Py_VerboseFlag)
1332 PySys_WriteStderr(
1333 "Tab size set to %d\n",
1334 newsize);
1335 }
1336 }
1337 }
1338 while (c != EOF && c != '\n')
1339 c = tok_nextc(tok);
1340 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001341
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001342 /* Check for EOF and errors now */
1343 if (c == EOF) {
1344 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1345 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001346
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001347 /* Identifier (most frequent token!) */
1348 if (Py_ISALPHA(c) || c == '_') {
1349 /* Process r"", u"" and ur"" */
1350 switch (c) {
1351 case 'b':
1352 case 'B':
1353 c = tok_nextc(tok);
1354 if (c == 'r' || c == 'R')
1355 c = tok_nextc(tok);
1356 if (c == '"' || c == '\'')
1357 goto letter_quote;
1358 break;
1359 case 'r':
1360 case 'R':
1361 c = tok_nextc(tok);
1362 if (c == '"' || c == '\'')
1363 goto letter_quote;
1364 break;
1365 case 'u':
1366 case 'U':
1367 c = tok_nextc(tok);
1368 if (c == 'r' || c == 'R')
1369 c = tok_nextc(tok);
1370 if (c == '"' || c == '\'')
1371 goto letter_quote;
1372 break;
1373 }
Stefan Krah3db41612010-06-24 09:33:05 +00001374 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001375 c = tok_nextc(tok);
1376 }
1377 tok_backup(tok, c);
1378 *p_start = tok->start;
1379 *p_end = tok->cur;
1380 return NAME;
1381 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001382
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001383 /* Newline */
1384 if (c == '\n') {
1385 tok->atbol = 1;
1386 if (blankline || tok->level > 0)
1387 goto nextline;
1388 *p_start = tok->start;
1389 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1390 tok->cont_line = 0;
1391 return NEWLINE;
1392 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001393
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001394 /* Period or number starting with period? */
1395 if (c == '.') {
1396 c = tok_nextc(tok);
1397 if (isdigit(c)) {
1398 goto fraction;
1399 }
1400 else {
1401 tok_backup(tok, c);
1402 *p_start = tok->start;
1403 *p_end = tok->cur;
1404 return DOT;
1405 }
1406 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001407
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001408 /* Number */
1409 if (isdigit(c)) {
1410 if (c == '0') {
1411 /* Hex, octal or binary -- maybe. */
1412 c = tok_nextc(tok);
1413 if (c == '.')
1414 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001415#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001416 if (c == 'j' || c == 'J')
1417 goto imaginary;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001418#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001419 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001420
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001421 /* Hex */
1422 c = tok_nextc(tok);
1423 if (!isxdigit(c)) {
1424 tok->done = E_TOKEN;
1425 tok_backup(tok, c);
1426 return ERRORTOKEN;
1427 }
1428 do {
1429 c = tok_nextc(tok);
1430 } while (isxdigit(c));
1431 }
1432 else if (c == 'o' || c == 'O') {
1433 /* Octal */
1434 c = tok_nextc(tok);
1435 if (c < '0' || c >= '8') {
1436 tok->done = E_TOKEN;
1437 tok_backup(tok, c);
1438 return ERRORTOKEN;
1439 }
1440 do {
1441 c = tok_nextc(tok);
1442 } while ('0' <= c && c < '8');
1443 }
1444 else if (c == 'b' || c == 'B') {
1445 /* Binary */
1446 c = tok_nextc(tok);
1447 if (c != '0' && c != '1') {
1448 tok->done = E_TOKEN;
1449 tok_backup(tok, c);
1450 return ERRORTOKEN;
1451 }
1452 do {
1453 c = tok_nextc(tok);
1454 } while (c == '0' || c == '1');
1455 }
1456 else {
1457 int found_decimal = 0;
1458 /* Octal; c is first char of it */
1459 /* There's no 'isoctdigit' macro, sigh */
1460 while ('0' <= c && c < '8') {
1461 c = tok_nextc(tok);
1462 }
1463 if (isdigit(c)) {
1464 found_decimal = 1;
1465 do {
1466 c = tok_nextc(tok);
1467 } while (isdigit(c));
1468 }
1469 if (c == '.')
1470 goto fraction;
1471 else if (c == 'e' || c == 'E')
1472 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001473#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001474 else if (c == 'j' || c == 'J')
1475 goto imaginary;
Tim Petersd507dab2001-08-30 20:51:59 +00001476#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001477 else if (found_decimal) {
1478 tok->done = E_TOKEN;
1479 tok_backup(tok, c);
1480 return ERRORTOKEN;
1481 }
1482 }
1483 if (c == 'l' || c == 'L')
1484 c = tok_nextc(tok);
1485 }
1486 else {
1487 /* Decimal */
1488 do {
1489 c = tok_nextc(tok);
1490 } while (isdigit(c));
1491 if (c == 'l' || c == 'L')
1492 c = tok_nextc(tok);
1493 else {
1494 /* Accept floating point numbers. */
1495 if (c == '.') {
1496 fraction:
1497 /* Fraction */
1498 do {
1499 c = tok_nextc(tok);
1500 } while (isdigit(c));
1501 }
1502 if (c == 'e' || c == 'E') {
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001503 int e;
1504 exponent:
1505 e = c;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001506 /* Exponent part */
1507 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001508 if (c == '+' || c == '-') {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001509 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001510 if (!isdigit(c)) {
1511 tok->done = E_TOKEN;
1512 tok_backup(tok, c);
1513 return ERRORTOKEN;
1514 }
1515 } else if (!isdigit(c)) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001516 tok_backup(tok, c);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001517 tok_backup(tok, e);
1518 *p_start = tok->start;
1519 *p_end = tok->cur;
1520 return NUMBER;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001521 }
1522 do {
1523 c = tok_nextc(tok);
1524 } while (isdigit(c));
1525 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001526#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001527 if (c == 'j' || c == 'J')
1528 /* Imaginary part */
1529 imaginary:
1530 c = tok_nextc(tok);
Guido van Rossumf595fde1996-01-12 01:31:58 +00001531#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001532 }
1533 }
1534 tok_backup(tok, c);
1535 *p_start = tok->start;
1536 *p_end = tok->cur;
1537 return NUMBER;
1538 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001539
1540 letter_quote:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001541 /* String */
1542 if (c == '\'' || c == '"') {
1543 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1544 int quote = c;
1545 int triple = 0;
1546 int tripcount = 0;
1547 for (;;) {
1548 c = tok_nextc(tok);
1549 if (c == '\n') {
1550 if (!triple) {
1551 tok->done = E_EOLS;
1552 tok_backup(tok, c);
1553 return ERRORTOKEN;
1554 }
1555 tripcount = 0;
1556 tok->cont_line = 1; /* multiline string. */
1557 }
1558 else if (c == EOF) {
1559 if (triple)
1560 tok->done = E_EOFS;
1561 else
1562 tok->done = E_EOLS;
1563 tok->cur = tok->inp;
1564 return ERRORTOKEN;
1565 }
1566 else if (c == quote) {
1567 tripcount++;
1568 if (tok->cur - tok->start == quote2) {
1569 c = tok_nextc(tok);
1570 if (c == quote) {
1571 triple = 1;
1572 tripcount = 0;
1573 continue;
1574 }
1575 tok_backup(tok, c);
1576 }
1577 if (!triple || tripcount == 3)
1578 break;
1579 }
1580 else if (c == '\\') {
1581 tripcount = 0;
1582 c = tok_nextc(tok);
1583 if (c == EOF) {
1584 tok->done = E_EOLS;
1585 tok->cur = tok->inp;
1586 return ERRORTOKEN;
1587 }
1588 }
1589 else
1590 tripcount = 0;
1591 }
1592 *p_start = tok->start;
1593 *p_end = tok->cur;
1594 return STRING;
1595 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001596
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001597 /* Line continuation */
1598 if (c == '\\') {
1599 c = tok_nextc(tok);
1600 if (c != '\n') {
1601 tok->done = E_LINECONT;
1602 tok->cur = tok->inp;
1603 return ERRORTOKEN;
1604 }
1605 tok->cont_line = 1;
1606 goto again; /* Read next line */
1607 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001608
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001609 /* Check for two-character token */
1610 {
1611 int c2 = tok_nextc(tok);
1612 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001613#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001614 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1615 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1616 "<> not supported in 3.x; use !=",
1617 tok->filename, tok->lineno,
1618 NULL, NULL)) {
1619 return ERRORTOKEN;
1620 }
1621 }
Christian Heimes02c9ab52007-11-23 12:12:02 +00001622#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001623 if (token != OP) {
1624 int c3 = tok_nextc(tok);
1625 int token3 = PyToken_ThreeChars(c, c2, c3);
1626 if (token3 != OP) {
1627 token = token3;
1628 } else {
1629 tok_backup(tok, c3);
1630 }
1631 *p_start = tok->start;
1632 *p_end = tok->cur;
1633 return token;
1634 }
1635 tok_backup(tok, c2);
1636 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001637
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001638 /* Keep track of parentheses nesting level */
1639 switch (c) {
1640 case '(':
1641 case '[':
1642 case '{':
1643 tok->level++;
1644 break;
1645 case ')':
1646 case ']':
1647 case '}':
1648 tok->level--;
1649 break;
1650 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001651
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001652 /* Punctuation character */
1653 *p_start = tok->start;
1654 *p_end = tok->cur;
1655 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001656}
1657
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001658int
1659PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1660{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001661 int result = tok_get(tok, p_start, p_end);
1662 if (tok->decoding_erred) {
1663 result = ERRORTOKEN;
1664 tok->done = E_DECODE;
1665 }
1666 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001667}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001668
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when no encoding support is compiled in: always NULL. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
/* Decode the first len bytes of text as UTF-8 and re-encode the result
   in encoding enc, using the "replace" error handler for both steps.
   Returns a new reference, or NULL (with the error cleared) on failure. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}

/* Convert the first len bytes of tok->buf from UTF-8 back to
   tok->encoding.

   Returns a NUL-terminated copy allocated with PyObject_MALLOC, or NULL
   if tok->encoding is not set, the conversion fails or memory runs out.
   When the conversion succeeds and *offset > 1, *offset is rewritten to
   the matching 1-based position in the re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    PyObject *lineobj;
    const char *line;
    Py_ssize_t linelen;

    if (tok->encoding == NULL)
        return NULL;

    /* convert source to original encoding */
    lineobj = dec_utf8(tok->encoding, tok->buf, len);
    if (lineobj == NULL)
        return NULL;

    linelen = PyString_Size(lineobj);
    line = PyString_AsString(lineobj);
    if (line == NULL || linelen < 0) {
        /* Not a usable string object: report "no text" rather than
           handing back an uninitialized buffer. */
        PyErr_Clear();
    }
    else {
        text = PyObject_MALLOC(linelen + 1);
        if (text != NULL) {
            memcpy(text, line, linelen);
            text[linelen] = '\0';
        }
    }
    Py_DECREF(lineobj);

    /* adjust error offset */
    if (*offset > 1) {
        PyObject *offsetobj = dec_utf8(tok->encoding,
                                       tok->buf, *offset-1);
        if (offsetobj) {
            *offset = (int)PyString_Size(offsetobj) + 1;
            Py_DECREF(offsetobj);
        }
    }

    return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif
1729
Martin v. Löwisa5136192007-09-04 14:19:28 +00001730
Guido van Rossum408027e1996-12-30 16:17:54 +00001731#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001732
1733void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001734tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001735{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001736 printf("%s", _PyParser_TokenNames[type]);
1737 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1738 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001739}
1740
1741#endif