blob: f4d7e3fc683218b5e942c67358cd13a3dc900218 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may start an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   The argument is fully parenthesized so that arbitrary expressions can be
   passed; it is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.
   The argument is fully parenthesized so that arbitrary expressions can be
   passed; it is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* basic tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* one-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons and multi-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of utf-8 and latin-1 to one canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the string literal "utf-8" or "iso-8859-1"
   when S is one of the recognized spellings (optionally followed by
   "-<suffix>"); otherwise returns S itself.  Callers can therefore detect
   "no change" by comparing the result with S, and must not free or modify
   a result that differs from S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Victor Stinner22a351a2010-10-14 12:04:34 +0000465 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 io = PyImport_ImportModuleNoBlock("io");
468 if (io == NULL)
469 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000470
Victor Stinner22a351a2010-10-14 12:04:34 +0000471 fd = fileno(tok->fp);
472 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
473 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
474 goto cleanup;
475 }
476
477 stream = PyObject_CallMethod(io, "open", "isisOOO",
478 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 if (stream == NULL)
480 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 Py_XDECREF(tok->decoding_readline);
483 readline = PyObject_GetAttrString(stream, "readline");
484 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 /* The file has been reopened; parsing will restart from
487 * the beginning of the file, we have to reset the line number.
488 * But this function has been called from inside tok_nextc() which
489 * will increment lineno before it returns. So we set it -1 so that
490 * the next call to tok_nextc() will start with tok->lineno == 0.
491 */
492 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000493
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000494 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(stream);
496 Py_XDECREF(io);
497 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498}
499
500/* Fetch the next byte from TOK. */
501
502static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504}
505
506/* Unfetch the last byte back into TOK. */
507
508static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510}
511
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   (1 to 4) if yes, 0 if not.

   s must be NUL-terminated.  Continuation bytes are examined in forward
   order, so the scan stops at the terminator of a truncated sequence
   instead of reading beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        /* A NUL terminator is < 0x80, so it ends the scan here. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
539
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540/* Read a line of input from TOK. Determine encoding
541 if necessary. */
542
543static char *
544decoding_fgets(char *s, int size, struct tok_state *tok)
545{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 char *line = NULL;
547 int badchar = 0;
548 for (;;) {
549 if (tok->decoding_state == STATE_NORMAL) {
550 /* We already have a codec associated with
551 this input. */
552 line = fp_readl(s, size, tok);
553 break;
554 } else if (tok->decoding_state == STATE_RAW) {
555 /* We want a 'raw' read. */
556 line = Py_UniversalNewlineFgets(s, size,
557 tok->fp, NULL);
558 break;
559 } else {
560 /* We have not yet determined the encoding.
561 If an encoding is found, use the file-pointer
562 reader functions from now on. */
563 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
564 return error_ret(tok);
565 assert(tok->decoding_state != STATE_INIT);
566 }
567 }
568 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
569 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
570 return error_ret(tok);
571 }
572 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000574 /* The default encoding is UTF-8, so make sure we don't have any
575 non-UTF-8 sequences in it. */
576 if (line && !tok->encoding) {
577 unsigned char *c;
578 int length;
579 for (c = (unsigned char *)line; *c; c += length)
580 if (!(length = valid_utf8(c))) {
581 badchar = *c;
582 break;
583 }
584 }
585 if (badchar) {
586 /* Need to add 1 to the line number, since this line
587 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200588 PyErr_Format(PyExc_SyntaxError,
589 "Non-UTF-8 code starting with '\\x%.2x' "
590 "in file %U on line %i, "
591 "but no encoding declared; "
592 "see http://python.org/dev/peps/pep-0263/ for details",
593 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000594 return error_ret(tok);
595 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000597 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598}
599
600static int
601decoding_feof(struct tok_state *tok)
602{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000603 if (tok->decoding_state != STATE_NORMAL) {
604 return feof(tok->fp);
605 } else {
606 PyObject* buf = tok->decoding_buffer;
607 if (buf == NULL) {
608 buf = PyObject_CallObject(tok->decoding_readline, NULL);
609 if (buf == NULL) {
610 error_ret(tok);
611 return 1;
612 } else {
613 tok->decoding_buffer = buf;
614 }
615 }
616 return PyObject_Length(buf) == 0;
617 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
620/* Fetch a byte from TOK, using the string buffer. */
621
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000622static int
623buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625}
626
627/* Unfetch a byte from TOK, using the string buffer. */
628
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000629static void
630buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 tok->str--;
632 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000633}
634
635/* Set the readline function for TOK to ENC. For the string-based
636 tokenizer, this means to just record the encoding. */
637
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000638static int
639buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 tok->enc = enc;
641 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642}
643
644/* Return a UTF-8 encoding Python string object from the
645 C byte string STR, which is encoded with ENC. */
646
647static PyObject *
648translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 PyObject *utf8;
650 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
651 if (buf == NULL)
652 return NULL;
653 utf8 = PyUnicode_AsUTF8String(buf);
654 Py_DECREF(buf);
655 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656}
657
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000658
659static char *
660translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
662 char *buf, *current;
663 char c = '\0';
664 buf = PyMem_MALLOC(needed_length);
665 if (buf == NULL) {
666 tok->done = E_NOMEM;
667 return NULL;
668 }
669 for (current = buf; *s; s++, current++) {
670 c = *s;
671 if (skip_next_lf) {
672 skip_next_lf = 0;
673 if (c == '\n') {
674 c = *++s;
675 if (!c)
676 break;
677 }
678 }
679 if (c == '\r') {
680 skip_next_lf = 1;
681 c = '\n';
682 }
683 *current = c;
684 }
685 /* If this is exec input, add a newline to the end of the string if
686 there isn't one already. */
687 if (exec_input && c != '\n') {
688 *current = '\n';
689 current++;
690 }
691 *current = '\0';
692 final_length = current - buf + 1;
693 if (final_length < needed_length && final_length)
694 /* should never fail */
695 buf = PyMem_REALLOC(buf, final_length);
696 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000697}
698
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699/* Decode a byte string STR for use as the buffer of TOK.
700 Look for encoding declarations inside STR, and record them
701 inside TOK. */
702
703static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 PyObject* utf8 = NULL;
707 const char *str;
708 const char *s;
709 const char *newl[2] = {NULL, NULL};
710 int lineno = 0;
711 tok->input = str = translate_newlines(input, single, tok);
712 if (str == NULL)
713 return NULL;
714 tok->enc = NULL;
715 tok->str = str;
716 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
717 return error_ret(tok);
718 str = tok->str; /* string after BOM if any */
719 assert(str);
720 if (tok->enc != NULL) {
721 utf8 = translate_into_utf8(str, tok->enc);
722 if (utf8 == NULL)
723 return error_ret(tok);
724 str = PyBytes_AsString(utf8);
725 }
726 for (s = str;; s++) {
727 if (*s == '\0') break;
728 else if (*s == '\n') {
729 assert(lineno < 2);
730 newl[lineno] = s;
731 lineno++;
732 if (lineno == 2) break;
733 }
734 }
735 tok->enc = NULL;
736 /* need to check line 1 and 2 separately since check_coding_spec
737 assumes a single line as input */
738 if (newl[0]) {
739 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
740 return error_ret(tok);
741 if (tok->enc == NULL && newl[1]) {
742 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
743 tok, buf_setreadl))
744 return error_ret(tok);
745 }
746 }
747 if (tok->enc != NULL) {
748 assert(utf8 == NULL);
749 utf8 = translate_into_utf8(str, tok->enc);
750 if (utf8 == NULL)
751 return error_ret(tok);
752 str = PyBytes_AS_STRING(utf8);
753 }
754 assert(tok->decoding_buffer == NULL);
755 tok->decoding_buffer = utf8; /* CAUTION */
756 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000757}
758
759#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760
761/* Set up tokenizer for string */
762
763struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000764PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 struct tok_state *tok = tok_new();
767 if (tok == NULL)
768 return NULL;
769 str = (char *)decode_str(str, exec_input, tok);
770 if (str == NULL) {
771 PyTokenizer_Free(tok);
772 return NULL;
773 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000774
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 /* XXX: constify members. */
776 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
777 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778}
779
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000780struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000781PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000782{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 struct tok_state *tok = tok_new();
784 if (tok == NULL)
785 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000786#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000788#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 if (str == NULL) {
790 PyTokenizer_Free(tok);
791 return NULL;
792 }
793 tok->decoding_state = STATE_RAW;
794 tok->read_coding_spec = 1;
795 tok->enc = NULL;
796 tok->str = str;
797 tok->encoding = (char *)PyMem_MALLOC(6);
798 if (!tok->encoding) {
799 PyTokenizer_Free(tok);
800 return NULL;
801 }
802 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000803
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 /* XXX: constify members. */
805 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
806 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000807}
808
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000809/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810
811struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000812PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 struct tok_state *tok = tok_new();
815 if (tok == NULL)
816 return NULL;
817 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
818 PyTokenizer_Free(tok);
819 return NULL;
820 }
821 tok->cur = tok->inp = tok->buf;
822 tok->end = tok->buf + BUFSIZ;
823 tok->fp = fp;
824 tok->prompt = ps1;
825 tok->nextprompt = ps2;
826 if (enc != NULL) {
827 /* Must copy encoding declaration since it
828 gets copied into the parse tree. */
829 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
830 if (!tok->encoding) {
831 PyTokenizer_Free(tok);
832 return NULL;
833 }
834 strcpy(tok->encoding, enc);
835 tok->decoding_state = STATE_NORMAL;
836 }
837 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838}
839
840
841/* Free a tok_state structure */
842
843void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000844PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 if (tok->encoding != NULL)
847 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000848#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 Py_XDECREF(tok->decoding_readline);
850 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200851 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000852#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 if (tok->fp != NULL && tok->buf != NULL)
854 PyMem_FREE(tok->buf);
855 if (tok->input)
856 PyMem_FREE((char *)tok->input);
857 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858}
859
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860/* Get next char, updating state; error code goes into tok->done */
861
862static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000863tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000864{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865 for (;;) {
866 if (tok->cur != tok->inp) {
867 return Py_CHARMASK(*tok->cur++); /* Fast path */
868 }
869 if (tok->done != E_OK)
870 return EOF;
871 if (tok->fp == NULL) {
872 char *end = strchr(tok->inp, '\n');
873 if (end != NULL)
874 end++;
875 else {
876 end = strchr(tok->inp, '\0');
877 if (end == tok->inp) {
878 tok->done = E_EOF;
879 return EOF;
880 }
881 }
882 if (tok->start == NULL)
883 tok->buf = tok->cur;
884 tok->line_start = tok->cur;
885 tok->lineno++;
886 tok->inp = end;
887 return Py_CHARMASK(*tok->cur++);
888 }
889 if (tok->prompt != NULL) {
890 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000891#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000892 if (newtok != NULL) {
893 char *translated = translate_newlines(newtok, 0, tok);
894 PyMem_FREE(newtok);
895 if (translated == NULL)
896 return EOF;
897 newtok = translated;
898 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 if (tok->encoding && newtok && *newtok) {
900 /* Recode to UTF-8 */
901 Py_ssize_t buflen;
902 const char* buf;
903 PyObject *u = translate_into_utf8(newtok, tok->encoding);
904 PyMem_FREE(newtok);
905 if (!u) {
906 tok->done = E_DECODE;
907 return EOF;
908 }
909 buflen = PyBytes_GET_SIZE(u);
910 buf = PyBytes_AS_STRING(u);
911 if (!buf) {
912 Py_DECREF(u);
913 tok->done = E_DECODE;
914 return EOF;
915 }
916 newtok = PyMem_MALLOC(buflen+1);
917 strcpy(newtok, buf);
918 Py_DECREF(u);
919 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000920#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921 if (tok->nextprompt != NULL)
922 tok->prompt = tok->nextprompt;
923 if (newtok == NULL)
924 tok->done = E_INTR;
925 else if (*newtok == '\0') {
926 PyMem_FREE(newtok);
927 tok->done = E_EOF;
928 }
929 else if (tok->start != NULL) {
930 size_t start = tok->start - tok->buf;
931 size_t oldlen = tok->cur - tok->buf;
932 size_t newlen = oldlen + strlen(newtok);
933 char *buf = tok->buf;
934 buf = (char *)PyMem_REALLOC(buf, newlen+1);
935 tok->lineno++;
936 if (buf == NULL) {
937 PyMem_FREE(tok->buf);
938 tok->buf = NULL;
939 PyMem_FREE(newtok);
940 tok->done = E_NOMEM;
941 return EOF;
942 }
943 tok->buf = buf;
944 tok->cur = tok->buf + oldlen;
945 tok->line_start = tok->cur;
946 strcpy(tok->buf + oldlen, newtok);
947 PyMem_FREE(newtok);
948 tok->inp = tok->buf + newlen;
949 tok->end = tok->inp + 1;
950 tok->start = tok->buf + start;
951 }
952 else {
953 tok->lineno++;
954 if (tok->buf != NULL)
955 PyMem_FREE(tok->buf);
956 tok->buf = newtok;
957 tok->line_start = tok->buf;
958 tok->cur = tok->buf;
959 tok->line_start = tok->buf;
960 tok->inp = strchr(tok->buf, '\0');
961 tok->end = tok->inp + 1;
962 }
963 }
964 else {
965 int done = 0;
966 Py_ssize_t cur = 0;
967 char *pt;
968 if (tok->start == NULL) {
969 if (tok->buf == NULL) {
970 tok->buf = (char *)
971 PyMem_MALLOC(BUFSIZ);
972 if (tok->buf == NULL) {
973 tok->done = E_NOMEM;
974 return EOF;
975 }
976 tok->end = tok->buf + BUFSIZ;
977 }
978 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
979 tok) == NULL) {
980 tok->done = E_EOF;
981 done = 1;
982 }
983 else {
984 tok->done = E_OK;
985 tok->inp = strchr(tok->buf, '\0');
986 done = tok->inp[-1] == '\n';
987 }
988 }
989 else {
990 cur = tok->cur - tok->buf;
991 if (decoding_feof(tok)) {
992 tok->done = E_EOF;
993 done = 1;
994 }
995 else
996 tok->done = E_OK;
997 }
998 tok->lineno++;
999 /* Read until '\n' or EOF */
1000 while (!done) {
1001 Py_ssize_t curstart = tok->start == NULL ? -1 :
1002 tok->start - tok->buf;
1003 Py_ssize_t curvalid = tok->inp - tok->buf;
1004 Py_ssize_t newsize = curvalid + BUFSIZ;
1005 char *newbuf = tok->buf;
1006 newbuf = (char *)PyMem_REALLOC(newbuf,
1007 newsize);
1008 if (newbuf == NULL) {
1009 tok->done = E_NOMEM;
1010 tok->cur = tok->inp;
1011 return EOF;
1012 }
1013 tok->buf = newbuf;
1014 tok->inp = tok->buf + curvalid;
1015 tok->end = tok->buf + newsize;
1016 tok->start = curstart < 0 ? NULL :
1017 tok->buf + curstart;
1018 if (decoding_fgets(tok->inp,
1019 (int)(tok->end - tok->inp),
1020 tok) == NULL) {
1021 /* Break out early on decoding
1022 errors, as tok->buf will be NULL
1023 */
1024 if (tok->decoding_erred)
1025 return EOF;
1026 /* Last line does not end in \n,
1027 fake one */
1028 strcpy(tok->inp, "\n");
1029 }
1030 tok->inp = strchr(tok->inp, '\0');
1031 done = tok->inp[-1] == '\n';
1032 }
1033 if (tok->buf != NULL) {
1034 tok->cur = tok->buf + cur;
1035 tok->line_start = tok->cur;
1036 /* replace "\r\n" with "\n" */
1037 /* For Mac leave the \r, giving a syntax error */
1038 pt = tok->inp - 2;
1039 if (pt >= tok->buf && *pt == '\r') {
1040 *pt++ = '\n';
1041 *pt = '\0';
1042 tok->inp = pt;
1043 }
1044 }
1045 }
1046 if (tok->done != E_OK) {
1047 if (tok->prompt != NULL)
1048 PySys_WriteStderr("\n");
1049 tok->cur = tok->inp;
1050 return EOF;
1051 }
1052 }
1053 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001054}
1055
1056
1057/* Back-up one character */
1058
1059static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001060tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 if (c != EOF) {
1063 if (--tok->cur < tok->buf)
1064 Py_FatalError("tok_backup: beginning of buffer");
1065 if (*tok->cur != c)
1066 *tok->cur = c;
1067 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001068}
1069
1070
1071/* Return the token corresponding to a single character */
1072
1073int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001074PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001075{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001076 switch (c) {
1077 case '(': return LPAR;
1078 case ')': return RPAR;
1079 case '[': return LSQB;
1080 case ']': return RSQB;
1081 case ':': return COLON;
1082 case ',': return COMMA;
1083 case ';': return SEMI;
1084 case '+': return PLUS;
1085 case '-': return MINUS;
1086 case '*': return STAR;
1087 case '/': return SLASH;
1088 case '|': return VBAR;
1089 case '&': return AMPER;
1090 case '<': return LESS;
1091 case '>': return GREATER;
1092 case '=': return EQUAL;
1093 case '.': return DOT;
1094 case '%': return PERCENT;
1095 case '{': return LBRACE;
1096 case '}': return RBRACE;
1097 case '^': return CIRCUMFLEX;
1098 case '~': return TILDE;
1099 case '@': return AT;
1100 default: return OP;
1101 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001102}
1103
1104
Guido van Rossumfbab9051991-10-20 20:25:03 +00001105int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001106PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001107{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 switch (c1) {
1109 case '=':
1110 switch (c2) {
1111 case '=': return EQEQUAL;
1112 }
1113 break;
1114 case '!':
1115 switch (c2) {
1116 case '=': return NOTEQUAL;
1117 }
1118 break;
1119 case '<':
1120 switch (c2) {
1121 case '>': return NOTEQUAL;
1122 case '=': return LESSEQUAL;
1123 case '<': return LEFTSHIFT;
1124 }
1125 break;
1126 case '>':
1127 switch (c2) {
1128 case '=': return GREATEREQUAL;
1129 case '>': return RIGHTSHIFT;
1130 }
1131 break;
1132 case '+':
1133 switch (c2) {
1134 case '=': return PLUSEQUAL;
1135 }
1136 break;
1137 case '-':
1138 switch (c2) {
1139 case '=': return MINEQUAL;
1140 case '>': return RARROW;
1141 }
1142 break;
1143 case '*':
1144 switch (c2) {
1145 case '*': return DOUBLESTAR;
1146 case '=': return STAREQUAL;
1147 }
1148 break;
1149 case '/':
1150 switch (c2) {
1151 case '/': return DOUBLESLASH;
1152 case '=': return SLASHEQUAL;
1153 }
1154 break;
1155 case '|':
1156 switch (c2) {
1157 case '=': return VBAREQUAL;
1158 }
1159 break;
1160 case '%':
1161 switch (c2) {
1162 case '=': return PERCENTEQUAL;
1163 }
1164 break;
1165 case '&':
1166 switch (c2) {
1167 case '=': return AMPEREQUAL;
1168 }
1169 break;
1170 case '^':
1171 switch (c2) {
1172 case '=': return CIRCUMFLEXEQUAL;
1173 }
1174 break;
1175 }
1176 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001177}
1178
Thomas Wouters434d0822000-08-24 20:11:32 +00001179int
1180PyToken_ThreeChars(int c1, int c2, int c3)
1181{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001182 switch (c1) {
1183 case '<':
1184 switch (c2) {
1185 case '<':
1186 switch (c3) {
1187 case '=':
1188 return LEFTSHIFTEQUAL;
1189 }
1190 break;
1191 }
1192 break;
1193 case '>':
1194 switch (c2) {
1195 case '>':
1196 switch (c3) {
1197 case '=':
1198 return RIGHTSHIFTEQUAL;
1199 }
1200 break;
1201 }
1202 break;
1203 case '*':
1204 switch (c2) {
1205 case '*':
1206 switch (c3) {
1207 case '=':
1208 return DOUBLESTAREQUAL;
1209 }
1210 break;
1211 }
1212 break;
1213 case '/':
1214 switch (c2) {
1215 case '/':
1216 switch (c3) {
1217 case '=':
1218 return DOUBLESLASHEQUAL;
1219 }
1220 break;
1221 }
1222 break;
1223 case '.':
1224 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001225 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 switch (c3) {
1227 case '.':
1228 return ELLIPSIS;
1229 }
1230 break;
1231 }
1232 break;
1233 }
1234 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001235}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001236
Guido van Rossum926f13a1998-04-09 21:38:06 +00001237static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001238indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001239{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 if (tok->alterror) {
1241 tok->done = E_TABSPACE;
1242 tok->cur = tok->inp;
1243 return 1;
1244 }
1245 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001246#ifdef PGEN
1247 PySys_WriteStderr("inconsistent use of tabs and spaces "
1248 "in indentation\n");
1249#else
1250 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001252#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 tok->altwarning = 0;
1254 }
1255 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001256}
1257
Martin v. Löwis47383402007-08-15 07:32:56 +00001258#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001259#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001260#else
1261/* Verify that the identifier follows PEP 3131. */
1262static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001263verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 PyObject *s;
1266 int result;
1267 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1268 if (s == NULL) {
1269 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1270 PyErr_Clear();
1271 tok->done = E_IDENTIFIER;
1272 } else {
1273 tok->done = E_ERROR;
1274 }
1275 return 0;
1276 }
1277 result = PyUnicode_IsIdentifier(s);
1278 Py_DECREF(s);
1279 if (result == 0)
1280 tok->done = E_IDENTIFIER;
1281 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001282}
1283#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001284
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285/* Get next token, after space stripping etc. */
1286
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001287static int
1288tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 register int c;
1291 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001292
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001294 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 tok->start = NULL;
1296 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001297
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 /* Get indentation level */
1299 if (tok->atbol) {
1300 register int col = 0;
1301 register int altcol = 0;
1302 tok->atbol = 0;
1303 for (;;) {
1304 c = tok_nextc(tok);
1305 if (c == ' ')
1306 col++, altcol++;
1307 else if (c == '\t') {
1308 col = (col/tok->tabsize + 1) * tok->tabsize;
1309 altcol = (altcol/tok->alttabsize + 1)
1310 * tok->alttabsize;
1311 }
1312 else if (c == '\014') /* Control-L (formfeed) */
1313 col = altcol = 0; /* For Emacs users */
1314 else
1315 break;
1316 }
1317 tok_backup(tok, c);
1318 if (c == '#' || c == '\n') {
1319 /* Lines with only whitespace and/or comments
1320 shouldn't affect the indentation and are
1321 not passed to the parser as NEWLINE tokens,
1322 except *totally* empty lines in interactive
1323 mode, which signal the end of a command group. */
1324 if (col == 0 && c == '\n' && tok->prompt != NULL)
1325 blankline = 0; /* Let it through */
1326 else
1327 blankline = 1; /* Ignore completely */
1328 /* We can't jump back right here since we still
1329 may need to skip to the end of a comment */
1330 }
1331 if (!blankline && tok->level == 0) {
1332 if (col == tok->indstack[tok->indent]) {
1333 /* No change */
1334 if (altcol != tok->altindstack[tok->indent]) {
1335 if (indenterror(tok))
1336 return ERRORTOKEN;
1337 }
1338 }
1339 else if (col > tok->indstack[tok->indent]) {
1340 /* Indent -- always one */
1341 if (tok->indent+1 >= MAXINDENT) {
1342 tok->done = E_TOODEEP;
1343 tok->cur = tok->inp;
1344 return ERRORTOKEN;
1345 }
1346 if (altcol <= tok->altindstack[tok->indent]) {
1347 if (indenterror(tok))
1348 return ERRORTOKEN;
1349 }
1350 tok->pendin++;
1351 tok->indstack[++tok->indent] = col;
1352 tok->altindstack[tok->indent] = altcol;
1353 }
1354 else /* col < tok->indstack[tok->indent] */ {
1355 /* Dedent -- any number, must be consistent */
1356 while (tok->indent > 0 &&
1357 col < tok->indstack[tok->indent]) {
1358 tok->pendin--;
1359 tok->indent--;
1360 }
1361 if (col != tok->indstack[tok->indent]) {
1362 tok->done = E_DEDENT;
1363 tok->cur = tok->inp;
1364 return ERRORTOKEN;
1365 }
1366 if (altcol != tok->altindstack[tok->indent]) {
1367 if (indenterror(tok))
1368 return ERRORTOKEN;
1369 }
1370 }
1371 }
1372 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001373
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001375
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 /* Return pending indents/dedents */
1377 if (tok->pendin != 0) {
1378 if (tok->pendin < 0) {
1379 tok->pendin++;
1380 return DEDENT;
1381 }
1382 else {
1383 tok->pendin--;
1384 return INDENT;
1385 }
1386 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001387
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 tok->start = NULL;
1390 /* Skip spaces */
1391 do {
1392 c = tok_nextc(tok);
1393 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001394
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 /* Set start of current token */
1396 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001397
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001398 /* Skip comment */
1399 if (c == '#')
1400 while (c != EOF && c != '\n')
1401 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 /* Check for EOF and errors now */
1404 if (c == EOF) {
1405 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1406 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 /* Identifier (most frequent token!) */
1409 nonascii = 0;
1410 if (is_potential_identifier_start(c)) {
1411 /* Process b"", r"" and br"" */
1412 if (c == 'b' || c == 'B') {
1413 c = tok_nextc(tok);
1414 if (c == '"' || c == '\'')
1415 goto letter_quote;
1416 }
1417 if (c == 'r' || c == 'R') {
1418 c = tok_nextc(tok);
1419 if (c == '"' || c == '\'')
1420 goto letter_quote;
1421 }
1422 while (is_potential_identifier_char(c)) {
1423 if (c >= 128)
1424 nonascii = 1;
1425 c = tok_nextc(tok);
1426 }
1427 tok_backup(tok, c);
1428 if (nonascii &&
1429 !verify_identifier(tok)) {
1430 tok->done = E_IDENTIFIER;
1431 return ERRORTOKEN;
1432 }
1433 *p_start = tok->start;
1434 *p_end = tok->cur;
1435 return NAME;
1436 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 /* Newline */
1439 if (c == '\n') {
1440 tok->atbol = 1;
1441 if (blankline || tok->level > 0)
1442 goto nextline;
1443 *p_start = tok->start;
1444 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1445 tok->cont_line = 0;
1446 return NEWLINE;
1447 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 /* Period or number starting with period? */
1450 if (c == '.') {
1451 c = tok_nextc(tok);
1452 if (isdigit(c)) {
1453 goto fraction;
1454 } else if (c == '.') {
1455 c = tok_nextc(tok);
1456 if (c == '.') {
1457 *p_start = tok->start;
1458 *p_end = tok->cur;
1459 return ELLIPSIS;
1460 } else {
1461 tok_backup(tok, c);
1462 }
1463 tok_backup(tok, '.');
1464 } else {
1465 tok_backup(tok, c);
1466 }
1467 *p_start = tok->start;
1468 *p_end = tok->cur;
1469 return DOT;
1470 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001471
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 /* Number */
1473 if (isdigit(c)) {
1474 if (c == '0') {
1475 /* Hex, octal or binary -- maybe. */
1476 c = tok_nextc(tok);
1477 if (c == '.')
1478 goto fraction;
1479 if (c == 'j' || c == 'J')
1480 goto imaginary;
1481 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001482
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 /* Hex */
1484 c = tok_nextc(tok);
1485 if (!isxdigit(c)) {
1486 tok->done = E_TOKEN;
1487 tok_backup(tok, c);
1488 return ERRORTOKEN;
1489 }
1490 do {
1491 c = tok_nextc(tok);
1492 } while (isxdigit(c));
1493 }
1494 else if (c == 'o' || c == 'O') {
1495 /* Octal */
1496 c = tok_nextc(tok);
1497 if (c < '0' || c >= '8') {
1498 tok->done = E_TOKEN;
1499 tok_backup(tok, c);
1500 return ERRORTOKEN;
1501 }
1502 do {
1503 c = tok_nextc(tok);
1504 } while ('0' <= c && c < '8');
1505 }
1506 else if (c == 'b' || c == 'B') {
1507 /* Binary */
1508 c = tok_nextc(tok);
1509 if (c != '0' && c != '1') {
1510 tok->done = E_TOKEN;
1511 tok_backup(tok, c);
1512 return ERRORTOKEN;
1513 }
1514 do {
1515 c = tok_nextc(tok);
1516 } while (c == '0' || c == '1');
1517 }
1518 else {
1519 int nonzero = 0;
1520 /* maybe old-style octal; c is first char of it */
1521 /* in any case, allow '0' as a literal */
1522 while (c == '0')
1523 c = tok_nextc(tok);
1524 while (isdigit(c)) {
1525 nonzero = 1;
1526 c = tok_nextc(tok);
1527 }
1528 if (c == '.')
1529 goto fraction;
1530 else if (c == 'e' || c == 'E')
1531 goto exponent;
1532 else if (c == 'j' || c == 'J')
1533 goto imaginary;
1534 else if (nonzero) {
1535 tok->done = E_TOKEN;
1536 tok_backup(tok, c);
1537 return ERRORTOKEN;
1538 }
1539 }
1540 }
1541 else {
1542 /* Decimal */
1543 do {
1544 c = tok_nextc(tok);
1545 } while (isdigit(c));
1546 {
1547 /* Accept floating point numbers. */
1548 if (c == '.') {
1549 fraction:
1550 /* Fraction */
1551 do {
1552 c = tok_nextc(tok);
1553 } while (isdigit(c));
1554 }
1555 if (c == 'e' || c == 'E') {
1556 exponent:
1557 /* Exponent part */
1558 c = tok_nextc(tok);
1559 if (c == '+' || c == '-')
1560 c = tok_nextc(tok);
1561 if (!isdigit(c)) {
1562 tok->done = E_TOKEN;
1563 tok_backup(tok, c);
1564 return ERRORTOKEN;
1565 }
1566 do {
1567 c = tok_nextc(tok);
1568 } while (isdigit(c));
1569 }
1570 if (c == 'j' || c == 'J')
1571 /* Imaginary part */
1572 imaginary:
1573 c = tok_nextc(tok);
1574 }
1575 }
1576 tok_backup(tok, c);
1577 *p_start = tok->start;
1578 *p_end = tok->cur;
1579 return NUMBER;
1580 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001581
1582 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001583 /* String */
1584 if (c == '\'' || c == '"') {
1585 int quote = c;
1586 int quote_size = 1; /* 1 or 3 */
1587 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001588
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 /* Find the quote size and start of string */
1590 c = tok_nextc(tok);
1591 if (c == quote) {
1592 c = tok_nextc(tok);
1593 if (c == quote)
1594 quote_size = 3;
1595 else
1596 end_quote_size = 1; /* empty string found */
1597 }
1598 if (c != quote)
1599 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001600
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 /* Get rest of string */
1602 while (end_quote_size != quote_size) {
1603 c = tok_nextc(tok);
1604 if (c == EOF) {
1605 if (quote_size == 3)
1606 tok->done = E_EOFS;
1607 else
1608 tok->done = E_EOLS;
1609 tok->cur = tok->inp;
1610 return ERRORTOKEN;
1611 }
1612 if (quote_size == 1 && c == '\n') {
1613 tok->done = E_EOLS;
1614 tok->cur = tok->inp;
1615 return ERRORTOKEN;
1616 }
1617 if (c == quote)
1618 end_quote_size += 1;
1619 else {
1620 end_quote_size = 0;
1621 if (c == '\\')
1622 c = tok_nextc(tok); /* skip escaped char */
1623 }
1624 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001625
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 *p_start = tok->start;
1627 *p_end = tok->cur;
1628 return STRING;
1629 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001630
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 /* Line continuation */
1632 if (c == '\\') {
1633 c = tok_nextc(tok);
1634 if (c != '\n') {
1635 tok->done = E_LINECONT;
1636 tok->cur = tok->inp;
1637 return ERRORTOKEN;
1638 }
1639 tok->cont_line = 1;
1640 goto again; /* Read next line */
1641 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001642
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 /* Check for two-character token */
1644 {
1645 int c2 = tok_nextc(tok);
1646 int token = PyToken_TwoChars(c, c2);
1647 if (token != OP) {
1648 int c3 = tok_nextc(tok);
1649 int token3 = PyToken_ThreeChars(c, c2, c3);
1650 if (token3 != OP) {
1651 token = token3;
1652 } else {
1653 tok_backup(tok, c3);
1654 }
1655 *p_start = tok->start;
1656 *p_end = tok->cur;
1657 return token;
1658 }
1659 tok_backup(tok, c2);
1660 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001661
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 /* Keep track of parentheses nesting level */
1663 switch (c) {
1664 case '(':
1665 case '[':
1666 case '{':
1667 tok->level++;
1668 break;
1669 case ')':
1670 case ']':
1671 case '}':
1672 tok->level--;
1673 break;
1674 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001675
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676 /* Punctuation character */
1677 *p_start = tok->start;
1678 *p_end = tok->cur;
1679 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001680}
1681
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001682int
1683PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1684{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 int result = tok_get(tok, p_start, p_end);
1686 if (tok->decoding_erred) {
1687 result = ERRORTOKEN;
1688 tok->done = E_DECODE;
1689 }
1690 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001691}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001692
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001693/* Get the encoding of a Python file. Check for the coding cookie and check if
1694 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001695
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001696 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1697 encoding in the first or second line of the file (in which case the encoding
1698 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001699
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001700 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1701 by the caller. */
1702
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001703char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001704PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 struct tok_state *tok;
1707 FILE *fp;
1708 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001709
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 fd = dup(fd);
1711 if (fd < 0) {
1712 return NULL;
1713 }
1714 fp = fdopen(fd, "r");
1715 if (fp == NULL) {
1716 return NULL;
1717 }
1718 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1719 if (tok == NULL) {
1720 fclose(fp);
1721 return NULL;
1722 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001723#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001724 if (filename != NULL) {
1725 Py_INCREF(filename);
1726 tok->filename = filename;
1727 }
1728 else {
1729 tok->filename = PyUnicode_FromString("<string>");
1730 if (tok->filename == NULL) {
1731 fclose(fp);
1732 PyTokenizer_Free(tok);
1733 return encoding;
1734 }
1735 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001736#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 while (tok->lineno < 2 && tok->done == E_OK) {
1738 PyTokenizer_Get(tok, &p_start, &p_end);
1739 }
1740 fclose(fp);
1741 if (tok->encoding) {
1742 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1743 if (encoding)
1744 strcpy(encoding, tok->encoding);
1745 }
1746 PyTokenizer_Free(tok);
1747 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001748}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001749
/* Same as PyTokenizer_FindEncodingFilename() with a NULL filename. */
char *
PyTokenizer_FindEncoding(int fd)
{
    char *encoding = PyTokenizer_FindEncodingFilename(fd, NULL);

    return encoding;
}
1755
#ifdef Py_DEBUG

/* Print a token to stdout: its name from _PyParser_TokenNames, followed
   by the text between start and end in parentheses for NAME, NUMBER,
   STRING and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif