blob: 61bfb4e1b7afb3914b47a47d07ad3bd1bc0b6b13 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
/* PyOS_Readline returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static void tok_backup(struct tok_state *tok, int c);
static int tok_nextc(struct tok_state *tok);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* Generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000098tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000130 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131}
132
Benjamin Petersone36199b2009-11-12 23:39:44 +0000133static char *
134new_string(const char *s, Py_ssize_t len)
135{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000142}
143
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144#ifdef PGEN
145
146static char *
147decoding_fgets(char *s, int size, struct tok_state *tok)
148{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000150}
151
152static int
153decoding_feof(struct tok_state *tok)
154{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000155 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000156}
157
/* PGEN build: the string is copied unchanged; EXEC_INPUT and TOK are
   not used. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    const size_t length = strlen(str);

    return new_string(str, length);
}
163
164#else /* PGEN */
165
166static char *
167error_ret(struct tok_state *tok) /* XXX */
168{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171 PyMem_FREE(tok->buf);
Serhiy Storchaka5d7d26c2015-11-14 15:14:29 +0200172 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
173 tok->done = E_DECODE;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000174 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175}
176
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177
/* Normalize the spelling of an encoding name, for utf-8 and latin-1.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S names one of those encodings (optionally followed by a "-suffix");
   otherwise returns S itself, unchanged.  The caller can therefore compare
   the result with S to find out whether a normalized name was substituted. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (other than
           EOF) to tolower() is undefined behaviour, and plain char may be
           signed. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000212 Py_ssize_t i;
213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000231
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000232 begin = t;
233 while (Py_ISALNUM(t[0]) ||
234 t[0] == '-' || t[0] == '_' || t[0] == '.')
235 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
Benjamin Peterson223546d2015-08-13 21:52:56 -0700239 char* q;
240 if (!r)
241 return NULL;
242 q = get_normal_name(r);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000243 if (r != q) {
244 PyMem_FREE(r);
245 r = new_string(q, strlen(q));
246 }
247 return r;
248 }
249 }
250 }
251 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000252}
253
254/* Check whether the line contains a coding spec. If it does,
255 invoke the set_readline function for the new encoding.
256 This function receives the tok_state and the new encoding.
257 Return 1 on success, 0 on failure. */
258
259static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000260check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000261 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000263 char * cs;
264 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300266 if (tok->cont_line) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000267 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300268 tok->read_coding_spec = 1;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000269 return 1;
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300270 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000271 cs = get_coding_spec(line, size);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300272 if (!cs) {
273 Py_ssize_t i;
274 for (i = 0; i < size; i++) {
275 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
276 break;
277 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
278 /* Stop checking coding spec after a line containing
279 * anything except a comment. */
280 tok->read_coding_spec = 1;
281 break;
282 }
283 }
284 } else {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000285 tok->read_coding_spec = 1;
286 if (tok->encoding == NULL) {
287 assert(tok->decoding_state == 1); /* raw */
288 if (strcmp(cs, "utf-8") == 0 ||
289 strcmp(cs, "iso-8859-1") == 0) {
290 tok->encoding = cs;
291 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000292#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000293 r = set_readline(tok, cs);
294 if (r) {
295 tok->encoding = cs;
296 tok->decoding_state = -1;
297 }
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300298 else {
299 PyErr_Format(PyExc_SyntaxError,
300 "encoding problem: %s", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000301 PyMem_FREE(cs);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300302 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000303#else
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000304 /* Without Unicode support, we cannot
305 process the coding spec. Since there
306 won't be any Unicode literals, that
307 won't matter. */
308 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000309#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000310 }
311 } else { /* then, compare cs with BOM */
312 r = (strcmp(tok->encoding, cs) == 0);
Serhiy Storchaka729ad5c2013-06-09 16:54:56 +0300313 if (!r)
314 PyErr_Format(PyExc_SyntaxError,
315 "encoding problem: %s with BOM", cs);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000316 PyMem_FREE(cs);
317 }
318 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000319 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320}
321
322/* See whether the file starts with a BOM. If it does,
323 invoke the set_readline function with the new encoding.
324 Return 1 on success, 0 on failure. */
325
326static int
327check_bom(int get_char(struct tok_state *),
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000328 void unget_char(int, struct tok_state *),
329 int set_readline(struct tok_state *, const char *),
330 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000331{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000332 int ch1, ch2, ch3;
333 ch1 = get_char(tok);
334 tok->decoding_state = 1;
335 if (ch1 == EOF) {
336 return 1;
337 } else if (ch1 == 0xEF) {
338 ch2 = get_char(tok);
339 if (ch2 != 0xBB) {
340 unget_char(ch2, tok);
341 unget_char(ch1, tok);
342 return 1;
343 }
344 ch3 = get_char(tok);
345 if (ch3 != 0xBF) {
346 unget_char(ch3, tok);
347 unget_char(ch2, tok);
348 unget_char(ch1, tok);
349 return 1;
350 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351#if 0
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000352 /* Disable support for UTF-16 BOMs until a decision
353 is made whether this needs to be supported. */
354 } else if (ch1 == 0xFE) {
355 ch2 = get_char(tok);
356 if (ch2 != 0xFF) {
357 unget_char(ch2, tok);
358 unget_char(ch1, tok);
359 return 1;
360 }
361 if (!set_readline(tok, "utf-16-be"))
362 return 0;
363 tok->decoding_state = -1;
364 } else if (ch1 == 0xFF) {
365 ch2 = get_char(tok);
366 if (ch2 != 0xFE) {
367 unget_char(ch2, tok);
368 unget_char(ch1, tok);
369 return 1;
370 }
371 if (!set_readline(tok, "utf-16-le"))
372 return 0;
373 tok->decoding_state = -1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000374#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000375 } else {
376 unget_char(ch1, tok);
377 return 1;
378 }
379 if (tok->encoding != NULL)
380 PyMem_FREE(tok->encoding);
381 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
382 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000383}
384
385/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000386 Return NULL on failure, else S.
Tim Petersc9d78aa2006-03-26 23:27:58 +0000387
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 On entry, tok->decoding_buffer will be one of:
389 1) NULL: need to call tok->decoding_readline to get a new line
390 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000391 stored the result in tok->decoding_buffer
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000392 3) PyStringObject *: previous call to fp_readl did not have enough room
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000393 (in the s buffer) to copy entire contents of the line read
394 by tok->decoding_readline. tok->decoding_buffer has the overflow.
395 In this case, fp_readl is called in a loop (with an expanded buffer)
396 until the buffer ends with a '\n' (or until the end of the file is
397 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000398*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *line_obj = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (line_obj != NULL) {
        /* Consume the pending object; an exact string is already
           UTF-8 overflow from a previous call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(line_obj))
            encoded = line_obj;
    }
    else {
        line_obj = PyObject_CallObject(tok->decoding_readline, NULL);
        if (line_obj == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(line_obj)) {
            Py_DECREF(line_obj);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(line_obj);
        Py_DECREF(line_obj);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
456
457/* Set the readline function for TOK to a StreamReader's
458 readline function. The StreamReader is named ENC.
459
460 This function is called from check_bom and check_coding_spec.
461
462 ENC is usually identical to the future value of tok->encoding,
463 except for the (currently unsupported) case of UTF-16.
464
465 Return 1 on success, 0 on failure. */
466
467static int
468fp_setreadl(struct tok_state *tok, const char* enc)
469{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000470 PyObject *reader, *stream, *readline;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000472 /* XXX: constify filename argument. */
473 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
474 if (stream == NULL)
475 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000477 reader = PyCodec_StreamReader(enc, stream, NULL);
478 Py_DECREF(stream);
479 if (reader == NULL)
480 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000482 readline = PyObject_GetAttrString(reader, "readline");
483 Py_DECREF(reader);
484 if (readline == NULL)
485 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000487 tok->decoding_readline = readline;
488 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000489}
490
491/* Fetch the next byte from TOK. */
492
493static int fp_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000494 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495}
496
497/* Unfetch the last byte back into TOK. */
498
499static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000500 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000501}
502
503/* Read a line of input from TOK. Determine encoding
504 if necessary. */
505
506static char *
507decoding_fgets(char *s, int size, struct tok_state *tok)
508{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000509 char *line = NULL;
510 int badchar = 0;
511 for (;;) {
512 if (tok->decoding_state < 0) {
513 /* We already have a codec associated with
514 this input. */
515 line = fp_readl(s, size, tok);
516 break;
517 } else if (tok->decoding_state > 0) {
518 /* We want a 'raw' read. */
519 line = Py_UniversalNewlineFgets(s, size,
520 tok->fp, NULL);
521 break;
522 } else {
523 /* We have not yet determined the encoding.
524 If an encoding is found, use the file-pointer
525 reader functions from now on. */
526 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
527 return error_ret(tok);
528 assert(tok->decoding_state != 0);
529 }
530 }
531 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
532 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
533 return error_ret(tok);
534 }
535 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000536#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000537 /* The default encoding is ASCII, so make sure we don't have any
538 non-ASCII bytes in it. */
539 if (line && !tok->encoding) {
540 unsigned char *c;
541 for (c = (unsigned char *)line; *c; c++)
542 if (*c > 127) {
543 badchar = *c;
544 break;
545 }
546 }
547 if (badchar) {
548 char buf[500];
549 /* Need to add 1 to the line number, since this line
550 has not been counted, yet. */
551 sprintf(buf,
552 "Non-ASCII character '\\x%.2x' "
553 "in file %.200s on line %i, "
554 "but no encoding declared; "
Ned Deily24b82092014-06-17 12:24:53 -0700555 "see http://python.org/dev/peps/pep-0263/ for details",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000556 badchar, tok->filename, tok->lineno + 1);
557 PyErr_SetString(PyExc_SyntaxError, buf);
558 return error_ret(tok);
559 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000561 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562}
563
564static int
565decoding_feof(struct tok_state *tok)
566{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000567 if (tok->decoding_state >= 0) {
568 return feof(tok->fp);
569 } else {
570 PyObject* buf = tok->decoding_buffer;
571 if (buf == NULL) {
572 buf = PyObject_CallObject(tok->decoding_readline, NULL);
573 if (buf == NULL) {
574 error_ret(tok);
575 return 1;
576 } else {
577 tok->decoding_buffer = buf;
578 }
579 }
580 return PyObject_Length(buf) == 0;
581 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582}
583
584/* Fetch a byte from TOK, using the string buffer. */
585
Tim Petersc9d78aa2006-03-26 23:27:58 +0000586static int
587buf_getc(struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000588 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589}
590
591/* Unfetch a byte from TOK, using the string buffer. */
592
Tim Petersc9d78aa2006-03-26 23:27:58 +0000593static void
594buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000595 tok->str--;
596 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597}
598
599/* Set the readline function for TOK to ENC. For the string-based
600 tokenizer, this means to just record the encoding. */
601
Tim Petersc9d78aa2006-03-26 23:27:58 +0000602static int
603buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000604 tok->enc = enc;
605 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606}
607
608/* Return a UTF-8 encoding Python string object from the
609 C byte string STR, which is encoded with ENC. */
610
Martin v. Löwis019934b2002-08-07 12:33:18 +0000611#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000612static PyObject *
613translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000614 PyObject *utf8;
615 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
616 if (buf == NULL)
617 return NULL;
618 utf8 = PyUnicode_AsUTF8String(buf);
619 Py_DECREF(buf);
620 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000622#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623
Benjamin Petersone36199b2009-11-12 23:39:44 +0000624
625static char *
626translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000627 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
628 char *buf, *current;
629 char c = '\0';
630 buf = PyMem_MALLOC(needed_length);
631 if (buf == NULL) {
632 tok->done = E_NOMEM;
633 return NULL;
634 }
635 for (current = buf; *s; s++, current++) {
636 c = *s;
637 if (skip_next_lf) {
638 skip_next_lf = 0;
639 if (c == '\n') {
640 c = *++s;
641 if (!c)
642 break;
643 }
644 }
645 if (c == '\r') {
646 skip_next_lf = 1;
647 c = '\n';
648 }
649 *current = c;
650 }
651 /* If this is exec input, add a newline to the end of the string if
652 there isn't one already. */
653 if (exec_input && c != '\n') {
654 *current = '\n';
655 current++;
656 }
657 *current = '\0';
658 final_length = current - buf + 1;
659 if (final_length < needed_length && final_length)
660 /* should never fail */
661 buf = PyMem_REALLOC(buf, final_length);
662 return buf;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000663}
664
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665/* Decode a byte string STR for use as the buffer of TOK.
666 Look for encoding declarations inside STR, and record them
667 inside TOK. */
668
669static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000670decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000671{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000672 PyObject* utf8 = NULL;
673 const char *str;
674 const char *s;
675 const char *newl[2] = {NULL, NULL};
676 int lineno = 0;
677 tok->input = str = translate_newlines(input, single, tok);
678 if (str == NULL)
679 return NULL;
680 tok->enc = NULL;
681 tok->str = str;
682 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
683 return error_ret(tok);
684 str = tok->str; /* string after BOM if any */
685 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000686#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000687 if (tok->enc != NULL) {
688 utf8 = translate_into_utf8(str, tok->enc);
689 if (utf8 == NULL)
690 return error_ret(tok);
691 str = PyString_AsString(utf8);
692 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000693#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000694 for (s = str;; s++) {
695 if (*s == '\0') break;
696 else if (*s == '\n') {
697 assert(lineno < 2);
698 newl[lineno] = s;
699 lineno++;
700 if (lineno == 2) break;
701 }
702 }
703 tok->enc = NULL;
704 /* need to check line 1 and 2 separately since check_coding_spec
705 assumes a single line as input */
706 if (newl[0]) {
707 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708 return error_ret(tok);
Serhiy Storchaka3eb554f2014-09-05 10:22:05 +0300709 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000710 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711 tok, buf_setreadl))
712 return error_ret(tok);
713 }
714 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000715#ifdef Py_USING_UNICODE
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000716 if (tok->enc != NULL) {
717 assert(utf8 == NULL);
718 utf8 = translate_into_utf8(str, tok->enc);
719 if (utf8 == NULL)
720 return error_ret(tok);
721 str = PyString_AsString(utf8);
722 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000723#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000724 assert(tok->decoding_buffer == NULL);
725 tok->decoding_buffer = utf8; /* CAUTION */
726 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000727}
728
729#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000730
731/* Set up tokenizer for string */
732
733struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000734PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000736 struct tok_state *tok = tok_new();
737 if (tok == NULL)
738 return NULL;
739 str = (char *)decode_str(str, exec_input, tok);
740 if (str == NULL) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000744
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000745 /* XXX: constify members. */
746 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
747 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748}
749
750
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000751/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752
753struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000754PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
759 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769}
770
771
772/* Free a tok_state structure */
773
774void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000775PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000777 if (tok->encoding != NULL)
778 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000779#ifndef PGEN
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000780 Py_XDECREF(tok->decoding_readline);
781 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000782#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000783 if (tok->fp != NULL && tok->buf != NULL)
784 PyMem_FREE(tok->buf);
785 if (tok->input)
786 PyMem_FREE((char *)tok->input);
787 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788}
789
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000790#if !defined(PGEN) && defined(Py_USING_UNICODE)
791static int
792tok_stdin_decode(struct tok_state *tok, char **inp)
793{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000794 PyObject *enc, *sysstdin, *decoded, *utf8;
795 const char *encoding;
796 char *converted;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000797
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000798 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
799 return 0;
800 sysstdin = PySys_GetObject("stdin");
801 if (sysstdin == NULL || !PyFile_Check(sysstdin))
802 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000803
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000804 enc = ((PyFileObject *)sysstdin)->f_encoding;
805 if (enc == NULL || !PyString_Check(enc))
806 return 0;
807 Py_INCREF(enc);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000808
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000809 encoding = PyString_AsString(enc);
810 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
811 if (decoded == NULL)
812 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000813
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000814 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
815 Py_DECREF(decoded);
816 if (utf8 == NULL)
817 goto error_clear;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000818
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000819 assert(PyString_Check(utf8));
820 converted = new_string(PyString_AS_STRING(utf8),
821 PyString_GET_SIZE(utf8));
822 Py_DECREF(utf8);
823 if (converted == NULL)
824 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000825
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000826 PyMem_FREE(*inp);
827 *inp = converted;
828 if (tok->encoding != NULL)
829 PyMem_FREE(tok->encoding);
830 tok->encoding = new_string(encoding, strlen(encoding));
831 if (tok->encoding == NULL)
832 goto error_nomem;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000833
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000834 Py_DECREF(enc);
835 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000836
837error_nomem:
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000838 Py_DECREF(enc);
839 tok->done = E_NOMEM;
840 return -1;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000841
842error_clear:
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000843 Py_DECREF(enc);
844 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
845 tok->done = E_ERROR;
846 return -1;
847 }
848 /* Fallback to iso-8859-1: for backward compatibility */
849 PyErr_Clear();
850 return 0;
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000851}
852#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853
854/* Get next char, updating state; error code goes into tok->done.

   Returns the next input character (passed through Py_CHARMASK) or EOF.
   When the current line is exhausted (tok->cur == tok->inp) the buffer is
   refilled from one of three sources:
     - an in-memory string       (tok->fp == NULL),
     - the interactive prompt    (tok->prompt != NULL, via PyOS_Readline),
     - a file                    (otherwise, via decoding_fgets).
   Throughout, tok->start != NULL means a token is currently being scanned,
   so the buffer contents from tok->start onward must be preserved. */
855
856static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000857tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858{
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000859 for (;;) {
 /* Characters already buffered between cur and inp: just hand one out. */
860 if (tok->cur != tok->inp) {
861 return Py_CHARMASK(*tok->cur++); /* Fast path */
862 }
 /* Once EOF or an error has been recorded, keep returning EOF. */
863 if (tok->done != E_OK)
864 return EOF;
 /* Source 1: tokenizing a string.  The whole input is already in
    memory, so "reading a line" only means advancing tok->inp to just
    past the next '\n' (or to the terminating NUL for a final line
    without a newline). */
865 if (tok->fp == NULL) {
866 char *end = strchr(tok->inp, '\n');
867 if (end != NULL)
868 end++;
869 else {
870 end = strchr(tok->inp, '\0');
 /* Nothing left at all: end of input. */
871 if (end == tok->inp) {
872 tok->done = E_EOF;
873 return EOF;
874 }
875 }
 /* No token in progress, so the buffer start can move up to the
    beginning of the new line. */
876 if (tok->start == NULL)
877 tok->buf = tok->cur;
878 tok->line_start = tok->cur;
879 tok->lineno++;
880 tok->inp = end;
881 return Py_CHARMASK(*tok->cur++);
882 }
 /* Source 2: interactive input.  PyOS_Readline returns a malloc'ed
    line, an empty malloc'ed string at EOF, or NULL if interrupted. */
883 if (tok->prompt != NULL) {
884 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 /* After the first read, switch to the continuation prompt. */
885 if (tok->nextprompt != NULL)
886 tok->prompt = tok->nextprompt;
887 if (newtok == NULL)
888 tok->done = E_INTR;
889 else if (*newtok == '\0') {
890 PyMem_FREE(newtok);
891 tok->done = E_EOF;
892 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000893#if !defined(PGEN) && defined(Py_USING_UNICODE)
 /* tok_stdin_decode may replace newtok with a re-encoded copy; on
    failure it has set tok->done and the line is discarded. */
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000894 else if (tok_stdin_decode(tok, &newtok) != 0)
895 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000896#endif
 /* A token is in progress: append the new line to the existing
    buffer.  Offsets are saved first because realloc may move the
    buffer, after which all pointers are rebuilt from them. */
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000897 else if (tok->start != NULL) {
898 size_t start = tok->start - tok->buf;
899 size_t oldlen = tok->cur - tok->buf;
900 size_t newlen = oldlen + strlen(newtok);
901 char *buf = tok->buf;
902 buf = (char *)PyMem_REALLOC(buf, newlen+1);
903 tok->lineno++;
904 if (buf == NULL) {
905 PyMem_FREE(tok->buf);
906 tok->buf = NULL;
907 PyMem_FREE(newtok);
908 tok->done = E_NOMEM;
909 return EOF;
910 }
911 tok->buf = buf;
912 tok->cur = tok->buf + oldlen;
913 tok->line_start = tok->cur;
914 strcpy(tok->buf + oldlen, newtok);
915 PyMem_FREE(newtok);
 /* inp points at the terminating NUL, end one past it. */
916 tok->inp = tok->buf + newlen;
917 tok->end = tok->inp + 1;
918 tok->start = tok->buf + start;
919 }
 /* No token in progress: drop the old buffer and adopt the new line
    as the buffer (tok->buf now owns newtok's memory). */
920 else {
921 tok->lineno++;
922 if (tok->buf != NULL)
923 PyMem_FREE(tok->buf);
924 tok->buf = newtok;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000925 tok->cur = tok->buf;
926 tok->line_start = tok->buf;
927 tok->inp = strchr(tok->buf, '\0');
928 tok->end = tok->inp + 1;
929 }
930 }
 /* Source 3: reading from a file through decoding_fgets. */
931 else {
 /* done: a complete line (or EOF) has been read.
    cur:  offset of tok->cur within tok->buf, kept across reallocs. */
932 int done = 0;
933 Py_ssize_t cur = 0;
934 char *pt;
 /* No token in progress: read the new line at the start of the
    buffer, allocating the initial BUFSIZ bytes on first use. */
935 if (tok->start == NULL) {
936 if (tok->buf == NULL) {
937 tok->buf = (char *)
938 PyMem_MALLOC(BUFSIZ);
939 if (tok->buf == NULL) {
940 tok->done = E_NOMEM;
941 return EOF;
942 }
943 tok->end = tok->buf + BUFSIZ;
944 }
945 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
946 tok) == NULL) {
 /* NULL with no decoding error is treated as end of file; on
    a decoding error tok->done is left untouched here. */
Serhiy Storchaka5d7d26c2015-11-14 15:14:29 +0200947 if (!tok->decoding_erred)
948 tok->done = E_EOF;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000949 done = 1;
950 }
951 else {
952 tok->done = E_OK;
953 tok->inp = strchr(tok->buf, '\0');
 /* Finished if nothing was read or the line ends in '\n';
    the emptiness test also guards the inp[-1] access. */
Benjamin Peterson274a7632016-09-18 23:41:11 -0700954 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000955 }
956 }
 /* A token is in progress: keep the buffer and append to it in the
    loop below, unless the file is already at EOF. */
957 else {
958 cur = tok->cur - tok->buf;
959 if (decoding_feof(tok)) {
960 tok->done = E_EOF;
961 done = 1;
962 }
963 else
964 tok->done = E_OK;
965 }
966 tok->lineno++;
967 /* Read until '\n' or EOF */
968 while (!done) {
 /* Grow the buffer so there are BUFSIZ free bytes after the
    valid data.  start/inp are kept as offsets because realloc
    may move the buffer. */
969 Py_ssize_t curstart = tok->start == NULL ? -1 :
970 tok->start - tok->buf;
971 Py_ssize_t curvalid = tok->inp - tok->buf;
972 Py_ssize_t newsize = curvalid + BUFSIZ;
973 char *newbuf = tok->buf;
974 newbuf = (char *)PyMem_REALLOC(newbuf,
975 newsize);
976 if (newbuf == NULL) {
977 tok->done = E_NOMEM;
978 tok->cur = tok->inp;
979 return EOF;
980 }
981 tok->buf = newbuf;
Serhiy Storchaka5d7d26c2015-11-14 15:14:29 +0200982 tok->cur = tok->buf + cur;
983 tok->line_start = tok->cur;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000984 tok->inp = tok->buf + curvalid;
985 tok->end = tok->buf + newsize;
986 tok->start = curstart < 0 ? NULL :
987 tok->buf + curstart;
988 if (decoding_fgets(tok->inp,
989 (int)(tok->end - tok->inp),
990 tok) == NULL) {
991 /* Break out early on decoding
992 errors, as tok->buf will be NULL
993 */
994 if (tok->decoding_erred)
995 return EOF;
996 /* Last line does not end in \n,
997 fake one */
998 strcpy(tok->inp, "\n");
999 }
1000 tok->inp = strchr(tok->inp, '\0');
1001 done = tok->inp[-1] == '\n';
1002 }
 /* tok->buf is NULL after a decoding error (see the comment in the
    loop above); in that case skip the pointer fix-ups. */
1003 if (tok->buf != NULL) {
1004 tok->cur = tok->buf + cur;
1005 tok->line_start = tok->cur;
1006 /* replace "\r\n" with "\n" */
1007 /* For Mac leave the \r, giving a syntax error */
1008 pt = tok->inp - 2;
1009 if (pt >= tok->buf && *pt == '\r') {
1010 *pt++ = '\n';
1011 *pt = '\0';
1012 tok->inp = pt;
1013 }
1014 }
1015 }
 /* The refill failed or hit EOF.  In interactive mode emit a newline
    on stderr, then mark the buffer as fully consumed.  Otherwise loop
    back so the fast path returns the first new character. */
1016 if (tok->done != E_OK) {
1017 if (tok->prompt != NULL)
1018 PySys_WriteStderr("\n");
1019 tok->cur = tok->inp;
1020 return EOF;
1021 }
1022 }
1023 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001024}
1025
1026
1027/* Back-up one character */
1028
1029static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001030tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001031{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001032 if (c != EOF) {
1033 if (--tok->cur < tok->buf)
1034 Py_FatalError("tok_backup: beginning of buffer");
1035 if (*tok->cur != c)
1036 *tok->cur = c;
1037 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001038}
1039
1040
1041/* Return the token corresponding to a single character */
1042
1043int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001044PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001045{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001046 switch (c) {
1047 case '(': return LPAR;
1048 case ')': return RPAR;
1049 case '[': return LSQB;
1050 case ']': return RSQB;
1051 case ':': return COLON;
1052 case ',': return COMMA;
1053 case ';': return SEMI;
1054 case '+': return PLUS;
1055 case '-': return MINUS;
1056 case '*': return STAR;
1057 case '/': return SLASH;
1058 case '|': return VBAR;
1059 case '&': return AMPER;
1060 case '<': return LESS;
1061 case '>': return GREATER;
1062 case '=': return EQUAL;
1063 case '.': return DOT;
1064 case '%': return PERCENT;
1065 case '`': return BACKQUOTE;
1066 case '{': return LBRACE;
1067 case '}': return RBRACE;
1068 case '^': return CIRCUMFLEX;
1069 case '~': return TILDE;
1070 case '@': return AT;
1071 default: return OP;
1072 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073}
1074
1075
Guido van Rossumfbab9051991-10-20 20:25:03 +00001076int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001077PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001078{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001079 switch (c1) {
1080 case '=':
1081 switch (c2) {
1082 case '=': return EQEQUAL;
1083 }
1084 break;
1085 case '!':
1086 switch (c2) {
1087 case '=': return NOTEQUAL;
1088 }
1089 break;
1090 case '<':
1091 switch (c2) {
1092 case '>': return NOTEQUAL;
1093 case '=': return LESSEQUAL;
1094 case '<': return LEFTSHIFT;
1095 }
1096 break;
1097 case '>':
1098 switch (c2) {
1099 case '=': return GREATEREQUAL;
1100 case '>': return RIGHTSHIFT;
1101 }
1102 break;
1103 case '+':
1104 switch (c2) {
1105 case '=': return PLUSEQUAL;
1106 }
1107 break;
1108 case '-':
1109 switch (c2) {
1110 case '=': return MINEQUAL;
1111 }
1112 break;
1113 case '*':
1114 switch (c2) {
1115 case '*': return DOUBLESTAR;
1116 case '=': return STAREQUAL;
1117 }
1118 break;
1119 case '/':
1120 switch (c2) {
1121 case '/': return DOUBLESLASH;
1122 case '=': return SLASHEQUAL;
1123 }
1124 break;
1125 case '|':
1126 switch (c2) {
1127 case '=': return VBAREQUAL;
1128 }
1129 break;
1130 case '%':
1131 switch (c2) {
1132 case '=': return PERCENTEQUAL;
1133 }
1134 break;
1135 case '&':
1136 switch (c2) {
1137 case '=': return AMPEREQUAL;
1138 }
1139 break;
1140 case '^':
1141 switch (c2) {
1142 case '=': return CIRCUMFLEXEQUAL;
1143 }
1144 break;
1145 }
1146 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001147}
1148
Thomas Wouters434d0822000-08-24 20:11:32 +00001149int
1150PyToken_ThreeChars(int c1, int c2, int c3)
1151{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001152 switch (c1) {
1153 case '<':
1154 switch (c2) {
1155 case '<':
1156 switch (c3) {
1157 case '=':
1158 return LEFTSHIFTEQUAL;
1159 }
1160 break;
1161 }
1162 break;
1163 case '>':
1164 switch (c2) {
1165 case '>':
1166 switch (c3) {
1167 case '=':
1168 return RIGHTSHIFTEQUAL;
1169 }
1170 break;
1171 }
1172 break;
1173 case '*':
1174 switch (c2) {
1175 case '*':
1176 switch (c3) {
1177 case '=':
1178 return DOUBLESTAREQUAL;
1179 }
1180 break;
1181 }
1182 break;
1183 case '/':
1184 switch (c2) {
1185 case '/':
1186 switch (c3) {
1187 case '=':
1188 return DOUBLESLASHEQUAL;
1189 }
1190 break;
1191 }
1192 break;
1193 }
1194 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001195}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001196
Guido van Rossum926f13a1998-04-09 21:38:06 +00001197static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001198indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001199{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001200 if (tok->alterror) {
1201 tok->done = E_TABSPACE;
1202 tok->cur = tok->inp;
1203 return 1;
1204 }
1205 if (tok->altwarning) {
1206 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1207 "in indentation\n", tok->filename);
1208 tok->altwarning = 0;
1209 }
1210 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001211}
1212
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213/* Get next token, after space stripping etc.

   Scans one token from tok and returns its token type.  *p_start and
   *p_end are first set to NULL; for tokens that carry text (NAME, NUMBER,
   STRING, NEWLINE, DOT and operators) they are then set to delimit that
   text inside the tokenizer buffer.  They stay NULL for INDENT, DEDENT,
   ENDMARKER and ERRORTOKEN.  Where this function returns ERRORTOKEN
   itself it first stores an error code in tok->done; at EOF it returns
   ERRORTOKEN when tok->done is anything other than E_EOF. */
1214
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001215static int
1216tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001218 register int c;
 /* Set when the current line holds only whitespace and/or a comment. */
1219 int blankline;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001220
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001221 *p_start = *p_end = NULL;
 /* Re-entered after a newline that is not reported as a NEWLINE token
    (blank line, or newline inside brackets). */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001222 nextline:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001223 tok->start = NULL;
1224 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001225
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001226 /* Get indentation level */
1227 if (tok->atbol) {
 /* The column is computed twice, with tok->tabsize (col) and with
    tok->alttabsize (altcol).  If the two disagree about how this line
    compares with the indentation stack, indenterror() is called. */
1228 register int col = 0;
1229 register int altcol = 0;
1230 tok->atbol = 0;
1231 for (;;) {
1232 c = tok_nextc(tok);
1233 if (c == ' ')
1234 col++, altcol++;
1235 else if (c == '\t') {
 /* Advance to the next tab stop. */
1236 col = (col/tok->tabsize + 1) * tok->tabsize;
1237 altcol = (altcol/tok->alttabsize + 1)
1238 * tok->alttabsize;
1239 }
1240 else if (c == '\014') /* Control-L (formfeed) */
1241 col = altcol = 0; /* For Emacs users */
1242 else
1243 break;
1244 }
 /* Push back the first non-whitespace character. */
1245 tok_backup(tok, c);
1246 if (c == '#' || c == '\n') {
1247 /* Lines with only whitespace and/or comments
1248 shouldn't affect the indentation and are
1249 not passed to the parser as NEWLINE tokens,
1250 except *totally* empty lines in interactive
1251 mode, which signal the end of a command group. */
1252 if (col == 0 && c == '\n' && tok->prompt != NULL)
1253 blankline = 0; /* Let it through */
1254 else
1255 blankline = 1; /* Ignore completely */
1256 /* We can't jump back right here since we still
1257 may need to skip to the end of a comment */
1258 }
 /* Indentation is only significant outside brackets (level == 0). */
1259 if (!blankline && tok->level == 0) {
1260 if (col == tok->indstack[tok->indent]) {
1261 /* No change */
1262 if (altcol != tok->altindstack[tok->indent]) {
1263 if (indenterror(tok))
1264 return ERRORTOKEN;
1265 }
1266 }
1267 else if (col > tok->indstack[tok->indent]) {
1268 /* Indent -- always one */
1269 if (tok->indent+1 >= MAXINDENT) {
1270 tok->done = E_TOODEEP;
1271 tok->cur = tok->inp;
1272 return ERRORTOKEN;
1273 }
1274 if (altcol <= tok->altindstack[tok->indent]) {
1275 if (indenterror(tok))
1276 return ERRORTOKEN;
1277 }
1278 tok->pendin++;
1279 tok->indstack[++tok->indent] = col;
1280 tok->altindstack[tok->indent] = altcol;
1281 }
1282 else /* col < tok->indstack[tok->indent] */ {
1283 /* Dedent -- any number, must be consistent */
1284 while (tok->indent > 0 &&
1285 col < tok->indstack[tok->indent]) {
1286 tok->pendin--;
1287 tok->indent--;
1288 }
 /* The new column must match an enclosing indentation level. */
1289 if (col != tok->indstack[tok->indent]) {
1290 tok->done = E_DEDENT;
1291 tok->cur = tok->inp;
1292 return ERRORTOKEN;
1293 }
1294 if (altcol != tok->altindstack[tok->indent]) {
1295 if (indenterror(tok))
1296 return ERRORTOKEN;
1297 }
1298 }
1299 }
1300 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001301
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001302 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001303
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001304 /* Return pending indents/dedents */
 /* pendin > 0 counts INDENT tokens still owed, pendin < 0 counts DEDENT
    tokens; one is handed out per call until the counter is back at 0. */
1305 if (tok->pendin != 0) {
1306 if (tok->pendin < 0) {
1307 tok->pendin++;
1308 return DEDENT;
1309 }
1310 else {
1311 tok->pendin--;
1312 return INDENT;
1313 }
1314 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001315
 /* Re-entered after a backslash line continuation. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 again:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001317 tok->start = NULL;
1318 /* Skip spaces */
1319 do {
1320 c = tok_nextc(tok);
1321 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001322
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001323 /* Set start of current token */
1324 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001325
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001326 /* Skip comment, while looking for tab-setting magic */
1327 if (c == '#') {
1328 static char *tabforms[] = {
1329 "tab-width:", /* Emacs */
1330 ":tabstop=", /* vim, full form */
1331 ":ts=", /* vim, abbreviated form */
1332 "set tabsize=", /* will vi never die? */
1333 /* more templates can be added here to support other editors */
1334 };
1335 char cbuf[80];
1336 char *tp, **cp;
1337 tp = cbuf;
 /* Copy the comment into cbuf, stopping at end of line, EOF, or when
    only room for the terminating NUL is left. */
1338 do {
1339 *tp++ = c = tok_nextc(tok);
1340 } while (c != EOF && c != '\n' &&
1341 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1342 *tp = '\0';
 /* Look for each editor's tab-size marker in the copied text; a value
    from 1 to 40 after it becomes the new tok->tabsize. */
1343 for (cp = tabforms;
1344 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1345 cp++) {
1346 if ((tp = strstr(cbuf, *cp))) {
1347 int newsize = atoi(tp + strlen(*cp));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001348
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001349 if (newsize >= 1 && newsize <= 40) {
1350 tok->tabsize = newsize;
1351 if (Py_VerboseFlag)
1352 PySys_WriteStderr(
1353 "Tab size set to %d\n",
1354 newsize);
1355 }
1356 }
1357 }
 /* Discard whatever is left of the comment (it may be longer than
    cbuf). */
1358 while (c != EOF && c != '\n')
1359 c = tok_nextc(tok);
1360 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001361
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001362 /* Check for EOF and errors now */
 /* tok_nextc returns EOF for real end of input and for errors alike;
    only a clean E_EOF yields ENDMARKER. */
1363 if (c == EOF) {
1364 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1365 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001366
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001367 /* Identifier (most frequent token!) */
1368 if (Py_ISALPHA(c) || c == '_') {
1369 /* Process string prefixes: b"", br"", r"", u"" and ur"" (any case).
    If a quote follows the prefix this is a string literal; otherwise
    the characters read so far are part of an identifier. */
1370 switch (c) {
1371 case 'b':
1372 case 'B':
1373 c = tok_nextc(tok);
1374 if (c == 'r' || c == 'R')
1375 c = tok_nextc(tok);
1376 if (c == '"' || c == '\'')
1377 goto letter_quote;
1378 break;
1379 case 'r':
1380 case 'R':
1381 c = tok_nextc(tok);
1382 if (c == '"' || c == '\'')
1383 goto letter_quote;
1384 break;
1385 case 'u':
1386 case 'U':
1387 c = tok_nextc(tok);
1388 if (c == 'r' || c == 'R')
1389 c = tok_nextc(tok);
1390 if (c == '"' || c == '\'')
1391 goto letter_quote;
1392 break;
1393 }
 /* Consume the rest of the identifier. */
Stefan Krah3db41612010-06-24 09:33:05 +00001394 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001395 c = tok_nextc(tok);
1396 }
1397 tok_backup(tok, c);
1398 *p_start = tok->start;
1399 *p_end = tok->cur;
1400 return NAME;
1401 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001402
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001403 /* Newline */
1404 if (c == '\n') {
1405 tok->atbol = 1;
 /* Blank lines and newlines inside brackets produce no token. */
1406 if (blankline || tok->level > 0)
1407 goto nextline;
1408 *p_start = tok->start;
1409 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1410 tok->cont_line = 0;
1411 return NEWLINE;
1412 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001413
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001414 /* Period or number starting with period? */
1415 if (c == '.') {
1416 c = tok_nextc(tok);
1417 if (isdigit(c)) {
1418 goto fraction;
1419 }
1420 else {
1421 tok_backup(tok, c);
1422 *p_start = tok->start;
1423 *p_end = tok->cur;
1424 return DOT;
1425 }
1426 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001427
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001428 /* Number */
1429 if (isdigit(c)) {
1430 if (c == '0') {
1431 /* Hex, octal or binary -- maybe. */
1432 c = tok_nextc(tok);
1433 if (c == '.')
1434 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001435#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001436 if (c == 'j' || c == 'J')
1437 goto imaginary;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001438#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001439 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001440
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001441 /* Hex */
 /* At least one hex digit is required after "0x". */
1442 c = tok_nextc(tok);
1443 if (!isxdigit(c)) {
1444 tok->done = E_TOKEN;
1445 tok_backup(tok, c);
1446 return ERRORTOKEN;
1447 }
1448 do {
1449 c = tok_nextc(tok);
1450 } while (isxdigit(c));
1451 }
1452 else if (c == 'o' || c == 'O') {
1453 /* Octal */
 /* At least one octal digit is required after "0o". */
1454 c = tok_nextc(tok);
1455 if (c < '0' || c >= '8') {
1456 tok->done = E_TOKEN;
1457 tok_backup(tok, c);
1458 return ERRORTOKEN;
1459 }
1460 do {
1461 c = tok_nextc(tok);
1462 } while ('0' <= c && c < '8');
1463 }
1464 else if (c == 'b' || c == 'B') {
1465 /* Binary */
 /* At least one binary digit is required after "0b". */
1466 c = tok_nextc(tok);
1467 if (c != '0' && c != '1') {
1468 tok->done = E_TOKEN;
1469 tok_backup(tok, c);
1470 return ERRORTOKEN;
1471 }
1472 do {
1473 c = tok_nextc(tok);
1474 } while (c == '0' || c == '1');
1475 }
1476 else {
 /* Set if a digit 8 or 9 shows up; that is only acceptable when
    the literal turns out to be a float or imaginary number. */
1477 int found_decimal = 0;
1478 /* Octal; c is first char of it */
1479 /* There's no 'isoctdigit' macro, sigh */
1480 while ('0' <= c && c < '8') {
1481 c = tok_nextc(tok);
1482 }
1483 if (isdigit(c)) {
1484 found_decimal = 1;
1485 do {
1486 c = tok_nextc(tok);
1487 } while (isdigit(c));
1488 }
1489 if (c == '.')
1490 goto fraction;
1491 else if (c == 'e' || c == 'E')
1492 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001493#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001494 else if (c == 'j' || c == 'J')
1495 goto imaginary;
Tim Petersd507dab2001-08-30 20:51:59 +00001496#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001497 else if (found_decimal) {
1498 tok->done = E_TOKEN;
1499 tok_backup(tok, c);
1500 return ERRORTOKEN;
1501 }
1502 }
 /* Optional long-integer suffix. */
1503 if (c == 'l' || c == 'L')
1504 c = tok_nextc(tok);
1505 }
1506 else {
1507 /* Decimal */
1508 do {
1509 c = tok_nextc(tok);
1510 } while (isdigit(c));
1511 if (c == 'l' || c == 'L')
1512 c = tok_nextc(tok);
1513 else {
1514 /* Accept floating point numbers. */
1515 if (c == '.') {
1516 fraction:
1517 /* Fraction */
1518 do {
1519 c = tok_nextc(tok);
1520 } while (isdigit(c));
1521 }
1522 if (c == 'e' || c == 'E') {
 /* Remembers the 'e'/'E' so it can be pushed back below. */
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001523 int e;
1524 exponent:
1525 e = c;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001526 /* Exponent part */
1527 c = tok_nextc(tok);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001528 if (c == '+' || c == '-') {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001529 c = tok_nextc(tok);
 /* A sign must be followed by a digit. */
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001530 if (!isdigit(c)) {
1531 tok->done = E_TOKEN;
1532 tok_backup(tok, c);
1533 return ERRORTOKEN;
1534 }
1535 } else if (!isdigit(c)) {
 /* 'e' not followed by a sign or digit is not an exponent:
    push back both characters and return the number scanned
    before the 'e'. */
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001536 tok_backup(tok, c);
Benjamin Peterson93e51aa2014-06-07 12:36:39 -07001537 tok_backup(tok, e);
1538 *p_start = tok->start;
1539 *p_end = tok->cur;
1540 return NUMBER;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001541 }
1542 do {
1543 c = tok_nextc(tok);
1544 } while (isdigit(c));
1545 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001546#ifndef WITHOUT_COMPLEX
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001547 if (c == 'j' || c == 'J')
1548 /* Imaginary part */
1549 imaginary:
1550 c = tok_nextc(tok);
Guido van Rossumf595fde1996-01-12 01:31:58 +00001551#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001552 }
1553 }
 /* c is the first character after the number; give it back. */
1554 tok_backup(tok, c);
1555 *p_start = tok->start;
1556 *p_end = tok->cur;
1557 return NUMBER;
1558 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001559
1560 letter_quote:
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001561 /* String */
1562 if (c == '\'' || c == '"') {
 /* quote2: value of tok->cur - tok->start right after reading a quote
       that immediately follows the opening quote.
    triple: set once the literal is known to be triple-quoted.
    tripcount: number of consecutive quote characters just seen. */
1563 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1564 int quote = c;
1565 int triple = 0;
1566 int tripcount = 0;
1567 for (;;) {
1568 c = tok_nextc(tok);
1569 if (c == '\n') {
 /* A raw newline is only allowed in triple-quoted strings. */
1570 if (!triple) {
1571 tok->done = E_EOLS;
1572 tok_backup(tok, c);
1573 return ERRORTOKEN;
1574 }
1575 tripcount = 0;
1576 tok->cont_line = 1; /* multiline string. */
1577 }
1578 else if (c == EOF) {
1579 if (triple)
1580 tok->done = E_EOFS;
1581 else
1582 tok->done = E_EOLS;
1583 tok->cur = tok->inp;
1584 return ERRORTOKEN;
1585 }
1586 else if (c == quote) {
1587 tripcount++;
 /* Second quote right after the opening one: either the start
    of a triple-quoted string or an empty string. */
1588 if (tok->cur - tok->start == quote2) {
1589 c = tok_nextc(tok);
1590 if (c == quote) {
1591 triple = 1;
1592 tripcount = 0;
1593 continue;
1594 }
1595 tok_backup(tok, c);
1596 }
 /* Done at the first quote for a plain string, or after three
    consecutive quotes for a triple-quoted one. */
1597 if (!triple || tripcount == 3)
1598 break;
1599 }
1600 else if (c == '\\') {
 /* Skip the escaped character so an escaped quote does not end
    the string. */
1601 tripcount = 0;
1602 c = tok_nextc(tok);
1603 if (c == EOF) {
1604 tok->done = E_EOLS;
1605 tok->cur = tok->inp;
1606 return ERRORTOKEN;
1607 }
1608 }
1609 else
1610 tripcount = 0;
1611 }
1612 *p_start = tok->start;
1613 *p_end = tok->cur;
1614 return STRING;
1615 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001616
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001617 /* Line continuation */
1618 if (c == '\\') {
 /* The backslash must be immediately followed by the newline. */
1619 c = tok_nextc(tok);
1620 if (c != '\n') {
1621 tok->done = E_LINECONT;
1622 tok->cur = tok->inp;
1623 return ERRORTOKEN;
1624 }
1625 tok->cont_line = 1;
1626 goto again; /* Read next line */
1627 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001628
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001629 /* Check for two-character token */
1630 {
1631 int c2 = tok_nextc(tok);
1632 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001633#ifndef PGEN
 /* Under the py3k warning flag the "<>" spelling of != triggers a
    DeprecationWarning; if issuing it fails, tokenizing stops with
    E_ERROR. */
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001634 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1635 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1636 "<> not supported in 3.x; use !=",
1637 tok->filename, tok->lineno,
1638 NULL, NULL)) {
Serhiy Storchakad5e75562018-05-31 07:35:39 +03001639 tok->done = E_ERROR;
1640 tok->cur = tok->inp;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001641 return ERRORTOKEN;
1642 }
1643 }
Christian Heimes02c9ab52007-11-23 12:12:02 +00001644#endif
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001645 if (token != OP) {
 /* Try to extend the two-character operator to three characters. */
1646 int c3 = tok_nextc(tok);
1647 int token3 = PyToken_ThreeChars(c, c2, c3);
1648 if (token3 != OP) {
1649 token = token3;
1650 } else {
1651 tok_backup(tok, c3);
1652 }
1653 *p_start = tok->start;
1654 *p_end = tok->cur;
1655 return token;
1656 }
1657 tok_backup(tok, c2);
1658 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001659
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001660 /* Keep track of parentheses nesting level */
1661 switch (c) {
1662 case '(':
1663 case '[':
1664 case '{':
1665 tok->level++;
1666 break;
1667 case ')':
1668 case ']':
1669 case '}':
1670 tok->level--;
1671 break;
1672 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001673
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001674 /* Punctuation character */
1675 *p_start = tok->start;
1676 *p_end = tok->cur;
1677 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001678}
1679
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001680int
1681PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1682{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001683 int result = tok_get(tok, p_start, p_end);
1684 if (tok->decoding_erred) {
1685 result = ERRORTOKEN;
1686 tok->done = E_DECODE;
1687 }
1688 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001689}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001690
Martin v. Löwisa5136192007-09-04 14:19:28 +00001691/* This function is only called from parsetok. However, it cannot live
1692 there, as it must be empty for PGEN, and we can check for PGEN only
1693 in this file. */
1694
#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Without Unicode support (or when building pgen) there is nothing to
   restore: always returns NULL and leaves *offset untouched. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
/* Decode the first len bytes of text as UTF-8 and re-encode the result in
   the encoding enc, using the "replace" error handler for both steps.
   Returns a new reference to the encoded string object, or NULL on
   failure; a pending exception is cleared in that case. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}

/* Convert the first len bytes of the tokenizer buffer from UTF-8 back to
   the source file's original encoding (tok->encoding).

   Returns a NUL-terminated copy allocated with PyObject_MALLOC, or NULL
   if no encoding is recorded, the conversion fails, or memory runs out.
   When the conversion succeeds and *offset > 1, *offset is rewritten to
   refer to the same position in the re-encoded text.  No exception is
   left pending by the conversion itself. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    PyObject *lineobj;
    const char *line;
    Py_ssize_t linelen;

    if (tok->encoding == NULL)
        return NULL;

    /* convert source to original encoding */
    lineobj = dec_utf8(tok->encoding, tok->buf, len);
    if (lineobj == NULL)
        return NULL;

    linelen = PyString_Size(lineobj);
    line = PyString_AsString(lineobj);
    if (line == NULL || linelen < 0) {
        /* Not a usable string object: report failure instead of handing
           back an uninitialised buffer. */
        PyErr_Clear();
        Py_DECREF(lineobj);
        return NULL;
    }

    text = PyObject_MALLOC(linelen + 1);
    if (text != NULL) {
        /* The length is known, so a plain bounded copy is enough. */
        memcpy(text, line, linelen);
        text[linelen] = '\0';
    }
    Py_DECREF(lineobj);

    /* adjust error offset */
    if (*offset > 1) {
        PyObject *offsetobj = dec_utf8(tok->encoding,
                                       tok->buf, *offset-1);
        if (offsetobj) {
            Py_ssize_t prefixlen = PyString_Size(offsetobj);
            if (prefixlen >= 0)
                *offset = (int)prefixlen + 1;
            else
                PyErr_Clear();
            Py_DECREF(offsetobj);
        }
    }

    return text;
}
#endif
1751
Martin v. Löwisa5136192007-09-04 14:19:28 +00001752
Guido van Rossum408027e1996-12-30 16:17:54 +00001753#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001754
1755void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001756tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001757{
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001758 printf("%s", _PyParser_TokenNames[type]);
1759 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1760 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001761}
1762
1763#endif