blob: 23ea2eb6baaeca3c6e62460ecb129593e2d76845 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore, or any
   value >= 128 (presumably a byte belonging to a non-ASCII character).
   The argument is fully parenthesized so that expression arguments such as
   `a ? b : c` are evaluated as a unit.  C is still evaluated several times,
   so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128 (presumably a byte belonging to a
   non-ASCII character).
   The argument is fully parenthesized so that expression arguments such as
   `a ? b : c` are evaluated as a unit.  C is still evaluated several times,
   so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names */

/* Printable name of each token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons and remaining operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "RARROW", "ELLIPSIS",

    /* generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding, just return a freshly allocated copy of STR
   (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Normalize the spelling of the utf-8 and latin-1 codec names.

   At most the first 12 characters of S are lower-cased, with '_' replaced
   by '-', and compared against the known spellings.  Returns the literal
   "utf-8" or "iso-8859-1" on a match (including names that continue with
   a '-' suffix, e.g. "utf-8-sig"); otherwise returns S itself, unmodified.
   Callers can compare the result with S to see whether a match occurred. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Convert through unsigned char: handing tolower() a negative
           value (a byte >= 0x80 where char is signed) is undefined. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200465 _Py_identifier(open);
Victor Stinner22a351a2010-10-14 12:04:34 +0000466 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 io = PyImport_ImportModuleNoBlock("io");
469 if (io == NULL)
470 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000471
Victor Stinner22a351a2010-10-14 12:04:34 +0000472 fd = fileno(tok->fp);
473 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
474 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
475 goto cleanup;
476 }
477
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200478 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000479 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 if (stream == NULL)
481 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000483 Py_XDECREF(tok->decoding_readline);
484 readline = PyObject_GetAttrString(stream, "readline");
485 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000486
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000487 /* The file has been reopened; parsing will restart from
488 * the beginning of the file, we have to reset the line number.
489 * But this function has been called from inside tok_nextc() which
490 * will increment lineno before it returns. So we set it -1 so that
491 * the next call to tok_nextc() will start with tok->lineno == 0.
492 */
493 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000494
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000495 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496 Py_XDECREF(stream);
497 Py_XDECREF(io);
498 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499}
500
501/* Fetch the next byte from TOK. */
502
503static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000504 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505}
506
507/* Unfetch the last byte back into TOK. */
508
509static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000511}
512
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the structure is checked: the lead byte
   determines how many continuation bytes (0x80..0xBF) must follow.
   Continuation bytes are examined in ascending order, so a sequence cut
   short by the terminating NUL is rejected at the NUL and no byte beyond
   the terminator is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
540
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541/* Read a line of input from TOK. Determine encoding
542 if necessary. */
543
544static char *
545decoding_fgets(char *s, int size, struct tok_state *tok)
546{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000547 char *line = NULL;
548 int badchar = 0;
549 for (;;) {
550 if (tok->decoding_state == STATE_NORMAL) {
551 /* We already have a codec associated with
552 this input. */
553 line = fp_readl(s, size, tok);
554 break;
555 } else if (tok->decoding_state == STATE_RAW) {
556 /* We want a 'raw' read. */
557 line = Py_UniversalNewlineFgets(s, size,
558 tok->fp, NULL);
559 break;
560 } else {
561 /* We have not yet determined the encoding.
562 If an encoding is found, use the file-pointer
563 reader functions from now on. */
564 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
565 return error_ret(tok);
566 assert(tok->decoding_state != STATE_INIT);
567 }
568 }
569 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
570 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
571 return error_ret(tok);
572 }
573 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 /* The default encoding is UTF-8, so make sure we don't have any
576 non-UTF-8 sequences in it. */
577 if (line && !tok->encoding) {
578 unsigned char *c;
579 int length;
580 for (c = (unsigned char *)line; *c; c += length)
581 if (!(length = valid_utf8(c))) {
582 badchar = *c;
583 break;
584 }
585 }
586 if (badchar) {
587 /* Need to add 1 to the line number, since this line
588 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200589 PyErr_Format(PyExc_SyntaxError,
590 "Non-UTF-8 code starting with '\\x%.2x' "
591 "in file %U on line %i, "
592 "but no encoding declared; "
593 "see http://python.org/dev/peps/pep-0263/ for details",
594 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000595 return error_ret(tok);
596 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599}
600
601static int
602decoding_feof(struct tok_state *tok)
603{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 if (tok->decoding_state != STATE_NORMAL) {
605 return feof(tok->fp);
606 } else {
607 PyObject* buf = tok->decoding_buffer;
608 if (buf == NULL) {
609 buf = PyObject_CallObject(tok->decoding_readline, NULL);
610 if (buf == NULL) {
611 error_ret(tok);
612 return 1;
613 } else {
614 tok->decoding_buffer = buf;
615 }
616 }
617 return PyObject_Length(buf) == 0;
618 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619}
620
621/* Fetch a byte from TOK, using the string buffer. */
622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000623static int
624buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626}
627
628/* Unfetch a byte from TOK, using the string buffer. */
629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000630static void
631buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 tok->str--;
633 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000634}
635
636/* Set the readline function for TOK to ENC. For the string-based
637 tokenizer, this means to just record the encoding. */
638
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000639static int
640buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000641 tok->enc = enc;
642 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643}
644
645/* Return a UTF-8 encoding Python string object from the
646 C byte string STR, which is encoded with ENC. */
647
648static PyObject *
649translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 PyObject *utf8;
651 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
652 if (buf == NULL)
653 return NULL;
654 utf8 = PyUnicode_AsUTF8String(buf);
655 Py_DECREF(buf);
656 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000657}
658
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000659
660static char *
661translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
663 char *buf, *current;
664 char c = '\0';
665 buf = PyMem_MALLOC(needed_length);
666 if (buf == NULL) {
667 tok->done = E_NOMEM;
668 return NULL;
669 }
670 for (current = buf; *s; s++, current++) {
671 c = *s;
672 if (skip_next_lf) {
673 skip_next_lf = 0;
674 if (c == '\n') {
675 c = *++s;
676 if (!c)
677 break;
678 }
679 }
680 if (c == '\r') {
681 skip_next_lf = 1;
682 c = '\n';
683 }
684 *current = c;
685 }
686 /* If this is exec input, add a newline to the end of the string if
687 there isn't one already. */
688 if (exec_input && c != '\n') {
689 *current = '\n';
690 current++;
691 }
692 *current = '\0';
693 final_length = current - buf + 1;
694 if (final_length < needed_length && final_length)
695 /* should never fail */
696 buf = PyMem_REALLOC(buf, final_length);
697 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000698}
699
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700/* Decode a byte string STR for use as the buffer of TOK.
701 Look for encoding declarations inside STR, and record them
702 inside TOK. */
703
704static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000705decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000706{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000707 PyObject* utf8 = NULL;
708 const char *str;
709 const char *s;
710 const char *newl[2] = {NULL, NULL};
711 int lineno = 0;
712 tok->input = str = translate_newlines(input, single, tok);
713 if (str == NULL)
714 return NULL;
715 tok->enc = NULL;
716 tok->str = str;
717 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
718 return error_ret(tok);
719 str = tok->str; /* string after BOM if any */
720 assert(str);
721 if (tok->enc != NULL) {
722 utf8 = translate_into_utf8(str, tok->enc);
723 if (utf8 == NULL)
724 return error_ret(tok);
725 str = PyBytes_AsString(utf8);
726 }
727 for (s = str;; s++) {
728 if (*s == '\0') break;
729 else if (*s == '\n') {
730 assert(lineno < 2);
731 newl[lineno] = s;
732 lineno++;
733 if (lineno == 2) break;
734 }
735 }
736 tok->enc = NULL;
737 /* need to check line 1 and 2 separately since check_coding_spec
738 assumes a single line as input */
739 if (newl[0]) {
740 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
741 return error_ret(tok);
742 if (tok->enc == NULL && newl[1]) {
743 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
744 tok, buf_setreadl))
745 return error_ret(tok);
746 }
747 }
748 if (tok->enc != NULL) {
749 assert(utf8 == NULL);
750 utf8 = translate_into_utf8(str, tok->enc);
751 if (utf8 == NULL)
752 return error_ret(tok);
753 str = PyBytes_AS_STRING(utf8);
754 }
755 assert(tok->decoding_buffer == NULL);
756 tok->decoding_buffer = utf8; /* CAUTION */
757 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000758}
759
760#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761
762/* Set up tokenizer for string */
763
764struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000765PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 struct tok_state *tok = tok_new();
768 if (tok == NULL)
769 return NULL;
770 str = (char *)decode_str(str, exec_input, tok);
771 if (str == NULL) {
772 PyTokenizer_Free(tok);
773 return NULL;
774 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000775
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 /* XXX: constify members. */
777 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
778 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779}
780
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000781struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000782PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000783{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 struct tok_state *tok = tok_new();
785 if (tok == NULL)
786 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000787#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000789#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 if (str == NULL) {
791 PyTokenizer_Free(tok);
792 return NULL;
793 }
794 tok->decoding_state = STATE_RAW;
795 tok->read_coding_spec = 1;
796 tok->enc = NULL;
797 tok->str = str;
798 tok->encoding = (char *)PyMem_MALLOC(6);
799 if (!tok->encoding) {
800 PyTokenizer_Free(tok);
801 return NULL;
802 }
803 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000804
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 /* XXX: constify members. */
806 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
807 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000808}
809
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000810/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811
812struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000813PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000814{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000815 struct tok_state *tok = tok_new();
816 if (tok == NULL)
817 return NULL;
818 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
819 PyTokenizer_Free(tok);
820 return NULL;
821 }
822 tok->cur = tok->inp = tok->buf;
823 tok->end = tok->buf + BUFSIZ;
824 tok->fp = fp;
825 tok->prompt = ps1;
826 tok->nextprompt = ps2;
827 if (enc != NULL) {
828 /* Must copy encoding declaration since it
829 gets copied into the parse tree. */
830 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
831 if (!tok->encoding) {
832 PyTokenizer_Free(tok);
833 return NULL;
834 }
835 strcpy(tok->encoding, enc);
836 tok->decoding_state = STATE_NORMAL;
837 }
838 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000839}
840
841
842/* Free a tok_state structure */
843
844void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000845PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000847 if (tok->encoding != NULL)
848 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000849#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 Py_XDECREF(tok->decoding_readline);
851 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200852 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000853#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 if (tok->fp != NULL && tok->buf != NULL)
855 PyMem_FREE(tok->buf);
856 if (tok->input)
857 PyMem_FREE((char *)tok->input);
858 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859}
860
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861/* Get next char, updating state; error code goes into tok->done */
862
863static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000864tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000865{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000866 for (;;) {
867 if (tok->cur != tok->inp) {
868 return Py_CHARMASK(*tok->cur++); /* Fast path */
869 }
870 if (tok->done != E_OK)
871 return EOF;
872 if (tok->fp == NULL) {
873 char *end = strchr(tok->inp, '\n');
874 if (end != NULL)
875 end++;
876 else {
877 end = strchr(tok->inp, '\0');
878 if (end == tok->inp) {
879 tok->done = E_EOF;
880 return EOF;
881 }
882 }
883 if (tok->start == NULL)
884 tok->buf = tok->cur;
885 tok->line_start = tok->cur;
886 tok->lineno++;
887 tok->inp = end;
888 return Py_CHARMASK(*tok->cur++);
889 }
890 if (tok->prompt != NULL) {
891 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000892#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000893 if (newtok != NULL) {
894 char *translated = translate_newlines(newtok, 0, tok);
895 PyMem_FREE(newtok);
896 if (translated == NULL)
897 return EOF;
898 newtok = translated;
899 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 if (tok->encoding && newtok && *newtok) {
901 /* Recode to UTF-8 */
902 Py_ssize_t buflen;
903 const char* buf;
904 PyObject *u = translate_into_utf8(newtok, tok->encoding);
905 PyMem_FREE(newtok);
906 if (!u) {
907 tok->done = E_DECODE;
908 return EOF;
909 }
910 buflen = PyBytes_GET_SIZE(u);
911 buf = PyBytes_AS_STRING(u);
912 if (!buf) {
913 Py_DECREF(u);
914 tok->done = E_DECODE;
915 return EOF;
916 }
917 newtok = PyMem_MALLOC(buflen+1);
918 strcpy(newtok, buf);
919 Py_DECREF(u);
920 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000921#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000922 if (tok->nextprompt != NULL)
923 tok->prompt = tok->nextprompt;
924 if (newtok == NULL)
925 tok->done = E_INTR;
926 else if (*newtok == '\0') {
927 PyMem_FREE(newtok);
928 tok->done = E_EOF;
929 }
930 else if (tok->start != NULL) {
931 size_t start = tok->start - tok->buf;
932 size_t oldlen = tok->cur - tok->buf;
933 size_t newlen = oldlen + strlen(newtok);
934 char *buf = tok->buf;
935 buf = (char *)PyMem_REALLOC(buf, newlen+1);
936 tok->lineno++;
937 if (buf == NULL) {
938 PyMem_FREE(tok->buf);
939 tok->buf = NULL;
940 PyMem_FREE(newtok);
941 tok->done = E_NOMEM;
942 return EOF;
943 }
944 tok->buf = buf;
945 tok->cur = tok->buf + oldlen;
946 tok->line_start = tok->cur;
947 strcpy(tok->buf + oldlen, newtok);
948 PyMem_FREE(newtok);
949 tok->inp = tok->buf + newlen;
950 tok->end = tok->inp + 1;
951 tok->start = tok->buf + start;
952 }
953 else {
954 tok->lineno++;
955 if (tok->buf != NULL)
956 PyMem_FREE(tok->buf);
957 tok->buf = newtok;
958 tok->line_start = tok->buf;
959 tok->cur = tok->buf;
960 tok->line_start = tok->buf;
961 tok->inp = strchr(tok->buf, '\0');
962 tok->end = tok->inp + 1;
963 }
964 }
965 else {
966 int done = 0;
967 Py_ssize_t cur = 0;
968 char *pt;
969 if (tok->start == NULL) {
970 if (tok->buf == NULL) {
971 tok->buf = (char *)
972 PyMem_MALLOC(BUFSIZ);
973 if (tok->buf == NULL) {
974 tok->done = E_NOMEM;
975 return EOF;
976 }
977 tok->end = tok->buf + BUFSIZ;
978 }
979 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
980 tok) == NULL) {
981 tok->done = E_EOF;
982 done = 1;
983 }
984 else {
985 tok->done = E_OK;
986 tok->inp = strchr(tok->buf, '\0');
987 done = tok->inp[-1] == '\n';
988 }
989 }
990 else {
991 cur = tok->cur - tok->buf;
992 if (decoding_feof(tok)) {
993 tok->done = E_EOF;
994 done = 1;
995 }
996 else
997 tok->done = E_OK;
998 }
999 tok->lineno++;
1000 /* Read until '\n' or EOF */
1001 while (!done) {
1002 Py_ssize_t curstart = tok->start == NULL ? -1 :
1003 tok->start - tok->buf;
1004 Py_ssize_t curvalid = tok->inp - tok->buf;
1005 Py_ssize_t newsize = curvalid + BUFSIZ;
1006 char *newbuf = tok->buf;
1007 newbuf = (char *)PyMem_REALLOC(newbuf,
1008 newsize);
1009 if (newbuf == NULL) {
1010 tok->done = E_NOMEM;
1011 tok->cur = tok->inp;
1012 return EOF;
1013 }
1014 tok->buf = newbuf;
1015 tok->inp = tok->buf + curvalid;
1016 tok->end = tok->buf + newsize;
1017 tok->start = curstart < 0 ? NULL :
1018 tok->buf + curstart;
1019 if (decoding_fgets(tok->inp,
1020 (int)(tok->end - tok->inp),
1021 tok) == NULL) {
1022 /* Break out early on decoding
1023 errors, as tok->buf will be NULL
1024 */
1025 if (tok->decoding_erred)
1026 return EOF;
1027 /* Last line does not end in \n,
1028 fake one */
1029 strcpy(tok->inp, "\n");
1030 }
1031 tok->inp = strchr(tok->inp, '\0');
1032 done = tok->inp[-1] == '\n';
1033 }
1034 if (tok->buf != NULL) {
1035 tok->cur = tok->buf + cur;
1036 tok->line_start = tok->cur;
1037 /* replace "\r\n" with "\n" */
1038 /* For Mac leave the \r, giving a syntax error */
1039 pt = tok->inp - 2;
1040 if (pt >= tok->buf && *pt == '\r') {
1041 *pt++ = '\n';
1042 *pt = '\0';
1043 tok->inp = pt;
1044 }
1045 }
1046 }
1047 if (tok->done != E_OK) {
1048 if (tok->prompt != NULL)
1049 PySys_WriteStderr("\n");
1050 tok->cur = tok->inp;
1051 return EOF;
1052 }
1053 }
1054 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001055}
1056
1057
1058/* Back-up one character */
1059
1060static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001061tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001062{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 if (c != EOF) {
1064 if (--tok->cur < tok->buf)
1065 Py_FatalError("tok_backup: beginning of buffer");
1066 if (*tok->cur != c)
1067 *tok->cur = c;
1068 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001069}
1070
1071
1072/* Return the token corresponding to a single character */
1073
1074int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001075PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001076{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 switch (c) {
1078 case '(': return LPAR;
1079 case ')': return RPAR;
1080 case '[': return LSQB;
1081 case ']': return RSQB;
1082 case ':': return COLON;
1083 case ',': return COMMA;
1084 case ';': return SEMI;
1085 case '+': return PLUS;
1086 case '-': return MINUS;
1087 case '*': return STAR;
1088 case '/': return SLASH;
1089 case '|': return VBAR;
1090 case '&': return AMPER;
1091 case '<': return LESS;
1092 case '>': return GREATER;
1093 case '=': return EQUAL;
1094 case '.': return DOT;
1095 case '%': return PERCENT;
1096 case '{': return LBRACE;
1097 case '}': return RBRACE;
1098 case '^': return CIRCUMFLEX;
1099 case '~': return TILDE;
1100 case '@': return AT;
1101 default: return OP;
1102 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103}
1104
1105
Guido van Rossumfbab9051991-10-20 20:25:03 +00001106int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001107PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 switch (c1) {
1110 case '=':
1111 switch (c2) {
1112 case '=': return EQEQUAL;
1113 }
1114 break;
1115 case '!':
1116 switch (c2) {
1117 case '=': return NOTEQUAL;
1118 }
1119 break;
1120 case '<':
1121 switch (c2) {
1122 case '>': return NOTEQUAL;
1123 case '=': return LESSEQUAL;
1124 case '<': return LEFTSHIFT;
1125 }
1126 break;
1127 case '>':
1128 switch (c2) {
1129 case '=': return GREATEREQUAL;
1130 case '>': return RIGHTSHIFT;
1131 }
1132 break;
1133 case '+':
1134 switch (c2) {
1135 case '=': return PLUSEQUAL;
1136 }
1137 break;
1138 case '-':
1139 switch (c2) {
1140 case '=': return MINEQUAL;
1141 case '>': return RARROW;
1142 }
1143 break;
1144 case '*':
1145 switch (c2) {
1146 case '*': return DOUBLESTAR;
1147 case '=': return STAREQUAL;
1148 }
1149 break;
1150 case '/':
1151 switch (c2) {
1152 case '/': return DOUBLESLASH;
1153 case '=': return SLASHEQUAL;
1154 }
1155 break;
1156 case '|':
1157 switch (c2) {
1158 case '=': return VBAREQUAL;
1159 }
1160 break;
1161 case '%':
1162 switch (c2) {
1163 case '=': return PERCENTEQUAL;
1164 }
1165 break;
1166 case '&':
1167 switch (c2) {
1168 case '=': return AMPEREQUAL;
1169 }
1170 break;
1171 case '^':
1172 switch (c2) {
1173 case '=': return CIRCUMFLEXEQUAL;
1174 }
1175 break;
1176 }
1177 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001178}
1179
Thomas Wouters434d0822000-08-24 20:11:32 +00001180int
1181PyToken_ThreeChars(int c1, int c2, int c3)
1182{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001183 switch (c1) {
1184 case '<':
1185 switch (c2) {
1186 case '<':
1187 switch (c3) {
1188 case '=':
1189 return LEFTSHIFTEQUAL;
1190 }
1191 break;
1192 }
1193 break;
1194 case '>':
1195 switch (c2) {
1196 case '>':
1197 switch (c3) {
1198 case '=':
1199 return RIGHTSHIFTEQUAL;
1200 }
1201 break;
1202 }
1203 break;
1204 case '*':
1205 switch (c2) {
1206 case '*':
1207 switch (c3) {
1208 case '=':
1209 return DOUBLESTAREQUAL;
1210 }
1211 break;
1212 }
1213 break;
1214 case '/':
1215 switch (c2) {
1216 case '/':
1217 switch (c3) {
1218 case '=':
1219 return DOUBLESLASHEQUAL;
1220 }
1221 break;
1222 }
1223 break;
1224 case '.':
1225 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001226 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 switch (c3) {
1228 case '.':
1229 return ELLIPSIS;
1230 }
1231 break;
1232 }
1233 break;
1234 }
1235 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001236}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001237
Guido van Rossum926f13a1998-04-09 21:38:06 +00001238static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001239indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001240{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001241 if (tok->alterror) {
1242 tok->done = E_TABSPACE;
1243 tok->cur = tok->inp;
1244 return 1;
1245 }
1246 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001247#ifdef PGEN
1248 PySys_WriteStderr("inconsistent use of tabs and spaces "
1249 "in indentation\n");
1250#else
1251 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001252 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001253#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 tok->altwarning = 0;
1255 }
1256 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001257}
1258
Martin v. Löwis47383402007-08-15 07:32:56 +00001259#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001260#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001261#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262/* Verify that the identifier follows PEP 3131.
1263 All identifier strings are guaranteed to be "ready" unicode objects.
1264 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001265static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001266verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001267{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 PyObject *s;
1269 int result;
1270 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1273 PyErr_Clear();
1274 tok->done = E_IDENTIFIER;
1275 } else {
1276 tok->done = E_ERROR;
1277 }
1278 return 0;
1279 }
1280 result = PyUnicode_IsIdentifier(s);
1281 Py_DECREF(s);
1282 if (result == 0)
1283 tok->done = E_IDENTIFIER;
1284 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001285}
1286#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001287
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288/* Get next token, after space stripping etc. */
1289
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001290static int
1291tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 register int c;
1294 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001295
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001297 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 tok->start = NULL;
1299 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001300
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301 /* Get indentation level */
1302 if (tok->atbol) {
1303 register int col = 0;
1304 register int altcol = 0;
1305 tok->atbol = 0;
1306 for (;;) {
1307 c = tok_nextc(tok);
1308 if (c == ' ')
1309 col++, altcol++;
1310 else if (c == '\t') {
1311 col = (col/tok->tabsize + 1) * tok->tabsize;
1312 altcol = (altcol/tok->alttabsize + 1)
1313 * tok->alttabsize;
1314 }
1315 else if (c == '\014') /* Control-L (formfeed) */
1316 col = altcol = 0; /* For Emacs users */
1317 else
1318 break;
1319 }
1320 tok_backup(tok, c);
1321 if (c == '#' || c == '\n') {
1322 /* Lines with only whitespace and/or comments
1323 shouldn't affect the indentation and are
1324 not passed to the parser as NEWLINE tokens,
1325 except *totally* empty lines in interactive
1326 mode, which signal the end of a command group. */
1327 if (col == 0 && c == '\n' && tok->prompt != NULL)
1328 blankline = 0; /* Let it through */
1329 else
1330 blankline = 1; /* Ignore completely */
1331 /* We can't jump back right here since we still
1332 may need to skip to the end of a comment */
1333 }
1334 if (!blankline && tok->level == 0) {
1335 if (col == tok->indstack[tok->indent]) {
1336 /* No change */
1337 if (altcol != tok->altindstack[tok->indent]) {
1338 if (indenterror(tok))
1339 return ERRORTOKEN;
1340 }
1341 }
1342 else if (col > tok->indstack[tok->indent]) {
1343 /* Indent -- always one */
1344 if (tok->indent+1 >= MAXINDENT) {
1345 tok->done = E_TOODEEP;
1346 tok->cur = tok->inp;
1347 return ERRORTOKEN;
1348 }
1349 if (altcol <= tok->altindstack[tok->indent]) {
1350 if (indenterror(tok))
1351 return ERRORTOKEN;
1352 }
1353 tok->pendin++;
1354 tok->indstack[++tok->indent] = col;
1355 tok->altindstack[tok->indent] = altcol;
1356 }
1357 else /* col < tok->indstack[tok->indent] */ {
1358 /* Dedent -- any number, must be consistent */
1359 while (tok->indent > 0 &&
1360 col < tok->indstack[tok->indent]) {
1361 tok->pendin--;
1362 tok->indent--;
1363 }
1364 if (col != tok->indstack[tok->indent]) {
1365 tok->done = E_DEDENT;
1366 tok->cur = tok->inp;
1367 return ERRORTOKEN;
1368 }
1369 if (altcol != tok->altindstack[tok->indent]) {
1370 if (indenterror(tok))
1371 return ERRORTOKEN;
1372 }
1373 }
1374 }
1375 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001376
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 /* Return pending indents/dedents */
1380 if (tok->pendin != 0) {
1381 if (tok->pendin < 0) {
1382 tok->pendin++;
1383 return DEDENT;
1384 }
1385 else {
1386 tok->pendin--;
1387 return INDENT;
1388 }
1389 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 tok->start = NULL;
1393 /* Skip spaces */
1394 do {
1395 c = tok_nextc(tok);
1396 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001397
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001398 /* Set start of current token */
1399 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 /* Skip comment */
1402 if (c == '#')
1403 while (c != EOF && c != '\n')
1404 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001405
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406 /* Check for EOF and errors now */
1407 if (c == EOF) {
1408 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1409 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 /* Identifier (most frequent token!) */
1412 nonascii = 0;
1413 if (is_potential_identifier_start(c)) {
1414 /* Process b"", r"" and br"" */
1415 if (c == 'b' || c == 'B') {
1416 c = tok_nextc(tok);
1417 if (c == '"' || c == '\'')
1418 goto letter_quote;
1419 }
1420 if (c == 'r' || c == 'R') {
1421 c = tok_nextc(tok);
1422 if (c == '"' || c == '\'')
1423 goto letter_quote;
1424 }
1425 while (is_potential_identifier_char(c)) {
1426 if (c >= 128)
1427 nonascii = 1;
1428 c = tok_nextc(tok);
1429 }
1430 tok_backup(tok, c);
1431 if (nonascii &&
1432 !verify_identifier(tok)) {
1433 tok->done = E_IDENTIFIER;
1434 return ERRORTOKEN;
1435 }
1436 *p_start = tok->start;
1437 *p_end = tok->cur;
1438 return NAME;
1439 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 /* Newline */
1442 if (c == '\n') {
1443 tok->atbol = 1;
1444 if (blankline || tok->level > 0)
1445 goto nextline;
1446 *p_start = tok->start;
1447 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1448 tok->cont_line = 0;
1449 return NEWLINE;
1450 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 /* Period or number starting with period? */
1453 if (c == '.') {
1454 c = tok_nextc(tok);
1455 if (isdigit(c)) {
1456 goto fraction;
1457 } else if (c == '.') {
1458 c = tok_nextc(tok);
1459 if (c == '.') {
1460 *p_start = tok->start;
1461 *p_end = tok->cur;
1462 return ELLIPSIS;
1463 } else {
1464 tok_backup(tok, c);
1465 }
1466 tok_backup(tok, '.');
1467 } else {
1468 tok_backup(tok, c);
1469 }
1470 *p_start = tok->start;
1471 *p_end = tok->cur;
1472 return DOT;
1473 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001474
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001475 /* Number */
1476 if (isdigit(c)) {
1477 if (c == '0') {
1478 /* Hex, octal or binary -- maybe. */
1479 c = tok_nextc(tok);
1480 if (c == '.')
1481 goto fraction;
1482 if (c == 'j' || c == 'J')
1483 goto imaginary;
1484 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001485
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 /* Hex */
1487 c = tok_nextc(tok);
1488 if (!isxdigit(c)) {
1489 tok->done = E_TOKEN;
1490 tok_backup(tok, c);
1491 return ERRORTOKEN;
1492 }
1493 do {
1494 c = tok_nextc(tok);
1495 } while (isxdigit(c));
1496 }
1497 else if (c == 'o' || c == 'O') {
1498 /* Octal */
1499 c = tok_nextc(tok);
1500 if (c < '0' || c >= '8') {
1501 tok->done = E_TOKEN;
1502 tok_backup(tok, c);
1503 return ERRORTOKEN;
1504 }
1505 do {
1506 c = tok_nextc(tok);
1507 } while ('0' <= c && c < '8');
1508 }
1509 else if (c == 'b' || c == 'B') {
1510 /* Binary */
1511 c = tok_nextc(tok);
1512 if (c != '0' && c != '1') {
1513 tok->done = E_TOKEN;
1514 tok_backup(tok, c);
1515 return ERRORTOKEN;
1516 }
1517 do {
1518 c = tok_nextc(tok);
1519 } while (c == '0' || c == '1');
1520 }
1521 else {
1522 int nonzero = 0;
1523 /* maybe old-style octal; c is first char of it */
1524 /* in any case, allow '0' as a literal */
1525 while (c == '0')
1526 c = tok_nextc(tok);
1527 while (isdigit(c)) {
1528 nonzero = 1;
1529 c = tok_nextc(tok);
1530 }
1531 if (c == '.')
1532 goto fraction;
1533 else if (c == 'e' || c == 'E')
1534 goto exponent;
1535 else if (c == 'j' || c == 'J')
1536 goto imaginary;
1537 else if (nonzero) {
1538 tok->done = E_TOKEN;
1539 tok_backup(tok, c);
1540 return ERRORTOKEN;
1541 }
1542 }
1543 }
1544 else {
1545 /* Decimal */
1546 do {
1547 c = tok_nextc(tok);
1548 } while (isdigit(c));
1549 {
1550 /* Accept floating point numbers. */
1551 if (c == '.') {
1552 fraction:
1553 /* Fraction */
1554 do {
1555 c = tok_nextc(tok);
1556 } while (isdigit(c));
1557 }
1558 if (c == 'e' || c == 'E') {
1559 exponent:
1560 /* Exponent part */
1561 c = tok_nextc(tok);
1562 if (c == '+' || c == '-')
1563 c = tok_nextc(tok);
1564 if (!isdigit(c)) {
1565 tok->done = E_TOKEN;
1566 tok_backup(tok, c);
1567 return ERRORTOKEN;
1568 }
1569 do {
1570 c = tok_nextc(tok);
1571 } while (isdigit(c));
1572 }
1573 if (c == 'j' || c == 'J')
1574 /* Imaginary part */
1575 imaginary:
1576 c = tok_nextc(tok);
1577 }
1578 }
1579 tok_backup(tok, c);
1580 *p_start = tok->start;
1581 *p_end = tok->cur;
1582 return NUMBER;
1583 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001584
1585 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001586 /* String */
1587 if (c == '\'' || c == '"') {
1588 int quote = c;
1589 int quote_size = 1; /* 1 or 3 */
1590 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001591
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 /* Find the quote size and start of string */
1593 c = tok_nextc(tok);
1594 if (c == quote) {
1595 c = tok_nextc(tok);
1596 if (c == quote)
1597 quote_size = 3;
1598 else
1599 end_quote_size = 1; /* empty string found */
1600 }
1601 if (c != quote)
1602 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001603
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 /* Get rest of string */
1605 while (end_quote_size != quote_size) {
1606 c = tok_nextc(tok);
1607 if (c == EOF) {
1608 if (quote_size == 3)
1609 tok->done = E_EOFS;
1610 else
1611 tok->done = E_EOLS;
1612 tok->cur = tok->inp;
1613 return ERRORTOKEN;
1614 }
1615 if (quote_size == 1 && c == '\n') {
1616 tok->done = E_EOLS;
1617 tok->cur = tok->inp;
1618 return ERRORTOKEN;
1619 }
1620 if (c == quote)
1621 end_quote_size += 1;
1622 else {
1623 end_quote_size = 0;
1624 if (c == '\\')
1625 c = tok_nextc(tok); /* skip escaped char */
1626 }
1627 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001628
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 *p_start = tok->start;
1630 *p_end = tok->cur;
1631 return STRING;
1632 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001633
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 /* Line continuation */
1635 if (c == '\\') {
1636 c = tok_nextc(tok);
1637 if (c != '\n') {
1638 tok->done = E_LINECONT;
1639 tok->cur = tok->inp;
1640 return ERRORTOKEN;
1641 }
1642 tok->cont_line = 1;
1643 goto again; /* Read next line */
1644 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001645
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 /* Check for two-character token */
1647 {
1648 int c2 = tok_nextc(tok);
1649 int token = PyToken_TwoChars(c, c2);
1650 if (token != OP) {
1651 int c3 = tok_nextc(tok);
1652 int token3 = PyToken_ThreeChars(c, c2, c3);
1653 if (token3 != OP) {
1654 token = token3;
1655 } else {
1656 tok_backup(tok, c3);
1657 }
1658 *p_start = tok->start;
1659 *p_end = tok->cur;
1660 return token;
1661 }
1662 tok_backup(tok, c2);
1663 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001664
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 /* Keep track of parentheses nesting level */
1666 switch (c) {
1667 case '(':
1668 case '[':
1669 case '{':
1670 tok->level++;
1671 break;
1672 case ')':
1673 case ']':
1674 case '}':
1675 tok->level--;
1676 break;
1677 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001678
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 /* Punctuation character */
1680 *p_start = tok->start;
1681 *p_end = tok->cur;
1682 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001683}
1684
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001685int
1686PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1687{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 int result = tok_get(tok, p_start, p_end);
1689 if (tok->decoding_erred) {
1690 result = ERRORTOKEN;
1691 tok->done = E_DECODE;
1692 }
1693 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001694}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001695
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001696/* Get the encoding of a Python file. Check for the coding cookie and check if
1697 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001698
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001699 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1700 encoding in the first or second line of the file (in which case the encoding
1701 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001702
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001703 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1704 by the caller. */
1705
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001706char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001707PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001708{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001709 struct tok_state *tok;
1710 FILE *fp;
1711 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001712
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713 fd = dup(fd);
1714 if (fd < 0) {
1715 return NULL;
1716 }
1717 fp = fdopen(fd, "r");
1718 if (fp == NULL) {
1719 return NULL;
1720 }
1721 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1722 if (tok == NULL) {
1723 fclose(fp);
1724 return NULL;
1725 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001726#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001727 if (filename != NULL) {
1728 Py_INCREF(filename);
1729 tok->filename = filename;
1730 }
1731 else {
1732 tok->filename = PyUnicode_FromString("<string>");
1733 if (tok->filename == NULL) {
1734 fclose(fp);
1735 PyTokenizer_Free(tok);
1736 return encoding;
1737 }
1738 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001739#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 while (tok->lineno < 2 && tok->done == E_OK) {
1741 PyTokenizer_Get(tok, &p_start, &p_end);
1742 }
1743 fclose(fp);
1744 if (tok->encoding) {
1745 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1746 if (encoding)
1747 strcpy(encoding, tok->encoding);
1748 }
1749 PyTokenizer_Free(tok);
1750 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001751}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001752
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001753char *
1754PyTokenizer_FindEncoding(int fd)
1755{
1756 return PyTokenizer_FindEncodingFilename(fd, NULL);
1757}
1758
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by the
   token text [start, end) in parentheses for NAME, NUMBER, STRING and OP
   tokens. */
void
tok_dump(int type, char *start, char *end)
{
    int has_text = (type == NAME || type == NUMBER ||
                    type == STRING || type == OP);

    printf("%s", _PyParser_TokenNames[type]);
    if (has_text) {
        int length = (int)(end - start);
        printf("(%.*s)", length, start);
    }
}

#endif