blob: f7ca59823285163d6bb6ddebe5f0256cfdcfe993 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   The argument is parenthesized so that expression arguments group
   correctly.  C is still evaluated several times, so it must not have
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or
   digit, an underscore, or any value >= 128.
   The argument is parenthesized so that expression arguments group
   correctly.  C is still evaluated several times, so it must not have
   side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of each token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons and multi-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "RARROW", "ELLIPSIS",

    /* generic operator, error marker and sentinel */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding is performed; return a PyMem-allocated copy
   of STR (NULL if the allocation fails).  EXEC_INPUT and TOK are
   unused here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   Only the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is one of the recognised spellings (optionally
   followed by "-<suffix>"), otherwise returns S itself unchanged.
   Callers can therefore detect "no normalisation" by comparing the
   result pointer with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Victor Stinner22a351a2010-10-14 12:04:34 +0000465 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 io = PyImport_ImportModuleNoBlock("io");
468 if (io == NULL)
469 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000470
Victor Stinner22a351a2010-10-14 12:04:34 +0000471 fd = fileno(tok->fp);
472 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
473 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
474 goto cleanup;
475 }
476
477 stream = PyObject_CallMethod(io, "open", "isisOOO",
478 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 if (stream == NULL)
480 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 Py_XDECREF(tok->decoding_readline);
483 readline = PyObject_GetAttrString(stream, "readline");
484 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 /* The file has been reopened; parsing will restart from
487 * the beginning of the file, we have to reset the line number.
488 * But this function has been called from inside tok_nextc() which
489 * will increment lineno before it returns. So we set it -1 so that
490 * the next call to tok_nextc() will start with tok->lineno == 0.
491 */
492 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000493
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000494 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(stream);
496 Py_XDECREF(io);
497 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498}
499
500/* Fetch the next byte from TOK. */
501
502static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504}
505
506/* Unfetch the last byte back into TOK. */
507
508static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510}
511
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence
   (1 to 4) if yes, 0 if not.

   Only the lead byte and the 10xxxxxx form of the continuation bytes
   are checked; overlong encodings and out-of-range code points are
   not rejected.

   S must be NUL-terminated.  Continuation bytes are examined in
   increasing order and the scan stops at the first invalid one, so a
   sequence cut short by the terminator is never read past it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
539
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540/* Read a line of input from TOK. Determine encoding
541 if necessary. */
542
543static char *
544decoding_fgets(char *s, int size, struct tok_state *tok)
545{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 char *line = NULL;
547 int badchar = 0;
548 for (;;) {
549 if (tok->decoding_state == STATE_NORMAL) {
550 /* We already have a codec associated with
551 this input. */
552 line = fp_readl(s, size, tok);
553 break;
554 } else if (tok->decoding_state == STATE_RAW) {
555 /* We want a 'raw' read. */
556 line = Py_UniversalNewlineFgets(s, size,
557 tok->fp, NULL);
558 break;
559 } else {
560 /* We have not yet determined the encoding.
561 If an encoding is found, use the file-pointer
562 reader functions from now on. */
563 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
564 return error_ret(tok);
565 assert(tok->decoding_state != STATE_INIT);
566 }
567 }
568 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
569 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
570 return error_ret(tok);
571 }
572 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000574 /* The default encoding is UTF-8, so make sure we don't have any
575 non-UTF-8 sequences in it. */
576 if (line && !tok->encoding) {
577 unsigned char *c;
578 int length;
579 for (c = (unsigned char *)line; *c; c += length)
580 if (!(length = valid_utf8(c))) {
581 badchar = *c;
582 break;
583 }
584 }
585 if (badchar) {
586 /* Need to add 1 to the line number, since this line
587 has not been counted, yet. */
Victor Stinnerc68b6aa2011-04-23 00:41:19 +0200588 if (tok->filename != NULL)
589 filename = PyUnicode_DecodeFSDefault(tok->filename);
590 else
591 filename = PyUnicode_FromString("<file>");
Victor Stinner83098a42010-12-27 20:12:13 +0000592 if (filename != NULL) {
593 PyErr_Format(PyExc_SyntaxError,
594 "Non-UTF-8 code starting with '\\x%.2x' "
Victor Stinneraaa4e9a2011-01-05 03:33:26 +0000595 "in file %U on line %i, "
Victor Stinner83098a42010-12-27 20:12:13 +0000596 "but no encoding declared; "
597 "see http://python.org/dev/peps/pep-0263/ for details",
598 badchar, filename, tok->lineno + 1);
599 Py_DECREF(filename);
600 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 return error_ret(tok);
602 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605}
606
607static int
608decoding_feof(struct tok_state *tok)
609{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 if (tok->decoding_state != STATE_NORMAL) {
611 return feof(tok->fp);
612 } else {
613 PyObject* buf = tok->decoding_buffer;
614 if (buf == NULL) {
615 buf = PyObject_CallObject(tok->decoding_readline, NULL);
616 if (buf == NULL) {
617 error_ret(tok);
618 return 1;
619 } else {
620 tok->decoding_buffer = buf;
621 }
622 }
623 return PyObject_Length(buf) == 0;
624 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625}
626
627/* Fetch a byte from TOK, using the string buffer. */
628
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000629static int
630buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
634/* Unfetch a byte from TOK, using the string buffer. */
635
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000636static void
637buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000638 tok->str--;
639 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640}
641
642/* Set the readline function for TOK to ENC. For the string-based
643 tokenizer, this means to just record the encoding. */
644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000645static int
646buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 tok->enc = enc;
648 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649}
650
651/* Return a UTF-8 encoding Python string object from the
652 C byte string STR, which is encoded with ENC. */
653
654static PyObject *
655translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000656 PyObject *utf8;
657 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
658 if (buf == NULL)
659 return NULL;
660 utf8 = PyUnicode_AsUTF8String(buf);
661 Py_DECREF(buf);
662 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663}
664
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000665
666static char *
667translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
669 char *buf, *current;
670 char c = '\0';
671 buf = PyMem_MALLOC(needed_length);
672 if (buf == NULL) {
673 tok->done = E_NOMEM;
674 return NULL;
675 }
676 for (current = buf; *s; s++, current++) {
677 c = *s;
678 if (skip_next_lf) {
679 skip_next_lf = 0;
680 if (c == '\n') {
681 c = *++s;
682 if (!c)
683 break;
684 }
685 }
686 if (c == '\r') {
687 skip_next_lf = 1;
688 c = '\n';
689 }
690 *current = c;
691 }
692 /* If this is exec input, add a newline to the end of the string if
693 there isn't one already. */
694 if (exec_input && c != '\n') {
695 *current = '\n';
696 current++;
697 }
698 *current = '\0';
699 final_length = current - buf + 1;
700 if (final_length < needed_length && final_length)
701 /* should never fail */
702 buf = PyMem_REALLOC(buf, final_length);
703 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704}
705
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000706/* Decode a byte string STR for use as the buffer of TOK.
707 Look for encoding declarations inside STR, and record them
708 inside TOK. */
709
710static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000711decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000712{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyObject* utf8 = NULL;
714 const char *str;
715 const char *s;
716 const char *newl[2] = {NULL, NULL};
717 int lineno = 0;
718 tok->input = str = translate_newlines(input, single, tok);
719 if (str == NULL)
720 return NULL;
721 tok->enc = NULL;
722 tok->str = str;
723 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
724 return error_ret(tok);
725 str = tok->str; /* string after BOM if any */
726 assert(str);
727 if (tok->enc != NULL) {
728 utf8 = translate_into_utf8(str, tok->enc);
729 if (utf8 == NULL)
730 return error_ret(tok);
731 str = PyBytes_AsString(utf8);
732 }
733 for (s = str;; s++) {
734 if (*s == '\0') break;
735 else if (*s == '\n') {
736 assert(lineno < 2);
737 newl[lineno] = s;
738 lineno++;
739 if (lineno == 2) break;
740 }
741 }
742 tok->enc = NULL;
743 /* need to check line 1 and 2 separately since check_coding_spec
744 assumes a single line as input */
745 if (newl[0]) {
746 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
747 return error_ret(tok);
748 if (tok->enc == NULL && newl[1]) {
749 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
750 tok, buf_setreadl))
751 return error_ret(tok);
752 }
753 }
754 if (tok->enc != NULL) {
755 assert(utf8 == NULL);
756 utf8 = translate_into_utf8(str, tok->enc);
757 if (utf8 == NULL)
758 return error_ret(tok);
759 str = PyBytes_AS_STRING(utf8);
760 }
761 assert(tok->decoding_buffer == NULL);
762 tok->decoding_buffer = utf8; /* CAUTION */
763 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000764}
765
766#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767
768/* Set up tokenizer for string */
769
770struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000771PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 struct tok_state *tok = tok_new();
774 if (tok == NULL)
775 return NULL;
776 str = (char *)decode_str(str, exec_input, tok);
777 if (str == NULL) {
778 PyTokenizer_Free(tok);
779 return NULL;
780 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000781
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 /* XXX: constify members. */
783 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
784 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785}
786
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000787struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000788PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000789{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 struct tok_state *tok = tok_new();
791 if (tok == NULL)
792 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000793#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000795#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 if (str == NULL) {
797 PyTokenizer_Free(tok);
798 return NULL;
799 }
800 tok->decoding_state = STATE_RAW;
801 tok->read_coding_spec = 1;
802 tok->enc = NULL;
803 tok->str = str;
804 tok->encoding = (char *)PyMem_MALLOC(6);
805 if (!tok->encoding) {
806 PyTokenizer_Free(tok);
807 return NULL;
808 }
809 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000810
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 /* XXX: constify members. */
812 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
813 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000814}
815
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000816/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817
818struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000819PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000820{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 struct tok_state *tok = tok_new();
822 if (tok == NULL)
823 return NULL;
824 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
825 PyTokenizer_Free(tok);
826 return NULL;
827 }
828 tok->cur = tok->inp = tok->buf;
829 tok->end = tok->buf + BUFSIZ;
830 tok->fp = fp;
831 tok->prompt = ps1;
832 tok->nextprompt = ps2;
833 if (enc != NULL) {
834 /* Must copy encoding declaration since it
835 gets copied into the parse tree. */
836 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
837 if (!tok->encoding) {
838 PyTokenizer_Free(tok);
839 return NULL;
840 }
841 strcpy(tok->encoding, enc);
842 tok->decoding_state = STATE_NORMAL;
843 }
844 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845}
846
847
848/* Free a tok_state structure */
849
850void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000851PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 if (tok->encoding != NULL)
854 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000855#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856 Py_XDECREF(tok->decoding_readline);
857 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200858 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000859#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 if (tok->fp != NULL && tok->buf != NULL)
861 PyMem_FREE(tok->buf);
862 if (tok->input)
863 PyMem_FREE((char *)tok->input);
864 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000865}
866
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000867/* Get next char, updating state; error code goes into tok->done */
868
869static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000870tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000871{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000872 for (;;) {
873 if (tok->cur != tok->inp) {
874 return Py_CHARMASK(*tok->cur++); /* Fast path */
875 }
876 if (tok->done != E_OK)
877 return EOF;
878 if (tok->fp == NULL) {
879 char *end = strchr(tok->inp, '\n');
880 if (end != NULL)
881 end++;
882 else {
883 end = strchr(tok->inp, '\0');
884 if (end == tok->inp) {
885 tok->done = E_EOF;
886 return EOF;
887 }
888 }
889 if (tok->start == NULL)
890 tok->buf = tok->cur;
891 tok->line_start = tok->cur;
892 tok->lineno++;
893 tok->inp = end;
894 return Py_CHARMASK(*tok->cur++);
895 }
896 if (tok->prompt != NULL) {
897 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000898#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000899 if (newtok != NULL) {
900 char *translated = translate_newlines(newtok, 0, tok);
901 PyMem_FREE(newtok);
902 if (translated == NULL)
903 return EOF;
904 newtok = translated;
905 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 if (tok->encoding && newtok && *newtok) {
907 /* Recode to UTF-8 */
908 Py_ssize_t buflen;
909 const char* buf;
910 PyObject *u = translate_into_utf8(newtok, tok->encoding);
911 PyMem_FREE(newtok);
912 if (!u) {
913 tok->done = E_DECODE;
914 return EOF;
915 }
916 buflen = PyBytes_GET_SIZE(u);
917 buf = PyBytes_AS_STRING(u);
918 if (!buf) {
919 Py_DECREF(u);
920 tok->done = E_DECODE;
921 return EOF;
922 }
923 newtok = PyMem_MALLOC(buflen+1);
924 strcpy(newtok, buf);
925 Py_DECREF(u);
926 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000927#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000928 if (tok->nextprompt != NULL)
929 tok->prompt = tok->nextprompt;
930 if (newtok == NULL)
931 tok->done = E_INTR;
932 else if (*newtok == '\0') {
933 PyMem_FREE(newtok);
934 tok->done = E_EOF;
935 }
936 else if (tok->start != NULL) {
937 size_t start = tok->start - tok->buf;
938 size_t oldlen = tok->cur - tok->buf;
939 size_t newlen = oldlen + strlen(newtok);
940 char *buf = tok->buf;
941 buf = (char *)PyMem_REALLOC(buf, newlen+1);
942 tok->lineno++;
943 if (buf == NULL) {
944 PyMem_FREE(tok->buf);
945 tok->buf = NULL;
946 PyMem_FREE(newtok);
947 tok->done = E_NOMEM;
948 return EOF;
949 }
950 tok->buf = buf;
951 tok->cur = tok->buf + oldlen;
952 tok->line_start = tok->cur;
953 strcpy(tok->buf + oldlen, newtok);
954 PyMem_FREE(newtok);
955 tok->inp = tok->buf + newlen;
956 tok->end = tok->inp + 1;
957 tok->start = tok->buf + start;
958 }
959 else {
960 tok->lineno++;
961 if (tok->buf != NULL)
962 PyMem_FREE(tok->buf);
963 tok->buf = newtok;
964 tok->line_start = tok->buf;
965 tok->cur = tok->buf;
966 tok->line_start = tok->buf;
967 tok->inp = strchr(tok->buf, '\0');
968 tok->end = tok->inp + 1;
969 }
970 }
971 else {
972 int done = 0;
973 Py_ssize_t cur = 0;
974 char *pt;
975 if (tok->start == NULL) {
976 if (tok->buf == NULL) {
977 tok->buf = (char *)
978 PyMem_MALLOC(BUFSIZ);
979 if (tok->buf == NULL) {
980 tok->done = E_NOMEM;
981 return EOF;
982 }
983 tok->end = tok->buf + BUFSIZ;
984 }
985 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
986 tok) == NULL) {
987 tok->done = E_EOF;
988 done = 1;
989 }
990 else {
991 tok->done = E_OK;
992 tok->inp = strchr(tok->buf, '\0');
993 done = tok->inp[-1] == '\n';
994 }
995 }
996 else {
997 cur = tok->cur - tok->buf;
998 if (decoding_feof(tok)) {
999 tok->done = E_EOF;
1000 done = 1;
1001 }
1002 else
1003 tok->done = E_OK;
1004 }
1005 tok->lineno++;
1006 /* Read until '\n' or EOF */
1007 while (!done) {
1008 Py_ssize_t curstart = tok->start == NULL ? -1 :
1009 tok->start - tok->buf;
1010 Py_ssize_t curvalid = tok->inp - tok->buf;
1011 Py_ssize_t newsize = curvalid + BUFSIZ;
1012 char *newbuf = tok->buf;
1013 newbuf = (char *)PyMem_REALLOC(newbuf,
1014 newsize);
1015 if (newbuf == NULL) {
1016 tok->done = E_NOMEM;
1017 tok->cur = tok->inp;
1018 return EOF;
1019 }
1020 tok->buf = newbuf;
1021 tok->inp = tok->buf + curvalid;
1022 tok->end = tok->buf + newsize;
1023 tok->start = curstart < 0 ? NULL :
1024 tok->buf + curstart;
1025 if (decoding_fgets(tok->inp,
1026 (int)(tok->end - tok->inp),
1027 tok) == NULL) {
1028 /* Break out early on decoding
1029 errors, as tok->buf will be NULL
1030 */
1031 if (tok->decoding_erred)
1032 return EOF;
1033 /* Last line does not end in \n,
1034 fake one */
1035 strcpy(tok->inp, "\n");
1036 }
1037 tok->inp = strchr(tok->inp, '\0');
1038 done = tok->inp[-1] == '\n';
1039 }
1040 if (tok->buf != NULL) {
1041 tok->cur = tok->buf + cur;
1042 tok->line_start = tok->cur;
1043 /* replace "\r\n" with "\n" */
1044 /* For Mac leave the \r, giving a syntax error */
1045 pt = tok->inp - 2;
1046 if (pt >= tok->buf && *pt == '\r') {
1047 *pt++ = '\n';
1048 *pt = '\0';
1049 tok->inp = pt;
1050 }
1051 }
1052 }
1053 if (tok->done != E_OK) {
1054 if (tok->prompt != NULL)
1055 PySys_WriteStderr("\n");
1056 tok->cur = tok->inp;
1057 return EOF;
1058 }
1059 }
1060 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061}
1062
1063
1064/* Back-up one character */
1065
1066static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001067tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001068{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 if (c != EOF) {
1070 if (--tok->cur < tok->buf)
1071 Py_FatalError("tok_backup: beginning of buffer");
1072 if (*tok->cur != c)
1073 *tok->cur = c;
1074 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001075}
1076
1077
1078/* Return the token corresponding to a single character */
1079
1080int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001081PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001082{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001083 switch (c) {
1084 case '(': return LPAR;
1085 case ')': return RPAR;
1086 case '[': return LSQB;
1087 case ']': return RSQB;
1088 case ':': return COLON;
1089 case ',': return COMMA;
1090 case ';': return SEMI;
1091 case '+': return PLUS;
1092 case '-': return MINUS;
1093 case '*': return STAR;
1094 case '/': return SLASH;
1095 case '|': return VBAR;
1096 case '&': return AMPER;
1097 case '<': return LESS;
1098 case '>': return GREATER;
1099 case '=': return EQUAL;
1100 case '.': return DOT;
1101 case '%': return PERCENT;
1102 case '{': return LBRACE;
1103 case '}': return RBRACE;
1104 case '^': return CIRCUMFLEX;
1105 case '~': return TILDE;
1106 case '@': return AT;
1107 default: return OP;
1108 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109}
1110
1111
Guido van Rossumfbab9051991-10-20 20:25:03 +00001112int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001113PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 switch (c1) {
1116 case '=':
1117 switch (c2) {
1118 case '=': return EQEQUAL;
1119 }
1120 break;
1121 case '!':
1122 switch (c2) {
1123 case '=': return NOTEQUAL;
1124 }
1125 break;
1126 case '<':
1127 switch (c2) {
1128 case '>': return NOTEQUAL;
1129 case '=': return LESSEQUAL;
1130 case '<': return LEFTSHIFT;
1131 }
1132 break;
1133 case '>':
1134 switch (c2) {
1135 case '=': return GREATEREQUAL;
1136 case '>': return RIGHTSHIFT;
1137 }
1138 break;
1139 case '+':
1140 switch (c2) {
1141 case '=': return PLUSEQUAL;
1142 }
1143 break;
1144 case '-':
1145 switch (c2) {
1146 case '=': return MINEQUAL;
1147 case '>': return RARROW;
1148 }
1149 break;
1150 case '*':
1151 switch (c2) {
1152 case '*': return DOUBLESTAR;
1153 case '=': return STAREQUAL;
1154 }
1155 break;
1156 case '/':
1157 switch (c2) {
1158 case '/': return DOUBLESLASH;
1159 case '=': return SLASHEQUAL;
1160 }
1161 break;
1162 case '|':
1163 switch (c2) {
1164 case '=': return VBAREQUAL;
1165 }
1166 break;
1167 case '%':
1168 switch (c2) {
1169 case '=': return PERCENTEQUAL;
1170 }
1171 break;
1172 case '&':
1173 switch (c2) {
1174 case '=': return AMPEREQUAL;
1175 }
1176 break;
1177 case '^':
1178 switch (c2) {
1179 case '=': return CIRCUMFLEXEQUAL;
1180 }
1181 break;
1182 }
1183 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001184}
1185
Thomas Wouters434d0822000-08-24 20:11:32 +00001186int
1187PyToken_ThreeChars(int c1, int c2, int c3)
1188{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 switch (c1) {
1190 case '<':
1191 switch (c2) {
1192 case '<':
1193 switch (c3) {
1194 case '=':
1195 return LEFTSHIFTEQUAL;
1196 }
1197 break;
1198 }
1199 break;
1200 case '>':
1201 switch (c2) {
1202 case '>':
1203 switch (c3) {
1204 case '=':
1205 return RIGHTSHIFTEQUAL;
1206 }
1207 break;
1208 }
1209 break;
1210 case '*':
1211 switch (c2) {
1212 case '*':
1213 switch (c3) {
1214 case '=':
1215 return DOUBLESTAREQUAL;
1216 }
1217 break;
1218 }
1219 break;
1220 case '/':
1221 switch (c2) {
1222 case '/':
1223 switch (c3) {
1224 case '=':
1225 return DOUBLESLASHEQUAL;
1226 }
1227 break;
1228 }
1229 break;
1230 case '.':
1231 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001232 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 switch (c3) {
1234 case '.':
1235 return ELLIPSIS;
1236 }
1237 break;
1238 }
1239 break;
1240 }
1241 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001242}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001243
Guido van Rossum926f13a1998-04-09 21:38:06 +00001244static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001245indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001246{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001247 if (tok->alterror) {
1248 tok->done = E_TABSPACE;
1249 tok->cur = tok->inp;
1250 return 1;
1251 }
1252 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001253#ifdef PGEN
1254 PySys_WriteStderr("inconsistent use of tabs and spaces "
1255 "in indentation\n");
1256#else
1257 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001259#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 tok->altwarning = 0;
1261 }
1262 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001263}
1264
Martin v. Löwis47383402007-08-15 07:32:56 +00001265#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001266#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001267#else
1268/* Verify that the identifier follows PEP 3131. */
1269static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001270verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001271{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 PyObject *s;
1273 int result;
1274 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1275 if (s == NULL) {
1276 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1277 PyErr_Clear();
1278 tok->done = E_IDENTIFIER;
1279 } else {
1280 tok->done = E_ERROR;
1281 }
1282 return 0;
1283 }
1284 result = PyUnicode_IsIdentifier(s);
1285 Py_DECREF(s);
1286 if (result == 0)
1287 tok->done = E_IDENTIFIER;
1288 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001289}
1290#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001291
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292/* Get next token, after space stripping etc. */
1293
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001294static int
1295tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 register int c;
1298 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001299
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001301 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 tok->start = NULL;
1303 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001304
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 /* Get indentation level */
1306 if (tok->atbol) {
1307 register int col = 0;
1308 register int altcol = 0;
1309 tok->atbol = 0;
1310 for (;;) {
1311 c = tok_nextc(tok);
1312 if (c == ' ')
1313 col++, altcol++;
1314 else if (c == '\t') {
1315 col = (col/tok->tabsize + 1) * tok->tabsize;
1316 altcol = (altcol/tok->alttabsize + 1)
1317 * tok->alttabsize;
1318 }
1319 else if (c == '\014') /* Control-L (formfeed) */
1320 col = altcol = 0; /* For Emacs users */
1321 else
1322 break;
1323 }
1324 tok_backup(tok, c);
1325 if (c == '#' || c == '\n') {
1326 /* Lines with only whitespace and/or comments
1327 shouldn't affect the indentation and are
1328 not passed to the parser as NEWLINE tokens,
1329 except *totally* empty lines in interactive
1330 mode, which signal the end of a command group. */
1331 if (col == 0 && c == '\n' && tok->prompt != NULL)
1332 blankline = 0; /* Let it through */
1333 else
1334 blankline = 1; /* Ignore completely */
1335 /* We can't jump back right here since we still
1336 may need to skip to the end of a comment */
1337 }
1338 if (!blankline && tok->level == 0) {
1339 if (col == tok->indstack[tok->indent]) {
1340 /* No change */
1341 if (altcol != tok->altindstack[tok->indent]) {
1342 if (indenterror(tok))
1343 return ERRORTOKEN;
1344 }
1345 }
1346 else if (col > tok->indstack[tok->indent]) {
1347 /* Indent -- always one */
1348 if (tok->indent+1 >= MAXINDENT) {
1349 tok->done = E_TOODEEP;
1350 tok->cur = tok->inp;
1351 return ERRORTOKEN;
1352 }
1353 if (altcol <= tok->altindstack[tok->indent]) {
1354 if (indenterror(tok))
1355 return ERRORTOKEN;
1356 }
1357 tok->pendin++;
1358 tok->indstack[++tok->indent] = col;
1359 tok->altindstack[tok->indent] = altcol;
1360 }
1361 else /* col < tok->indstack[tok->indent] */ {
1362 /* Dedent -- any number, must be consistent */
1363 while (tok->indent > 0 &&
1364 col < tok->indstack[tok->indent]) {
1365 tok->pendin--;
1366 tok->indent--;
1367 }
1368 if (col != tok->indstack[tok->indent]) {
1369 tok->done = E_DEDENT;
1370 tok->cur = tok->inp;
1371 return ERRORTOKEN;
1372 }
1373 if (altcol != tok->altindstack[tok->indent]) {
1374 if (indenterror(tok))
1375 return ERRORTOKEN;
1376 }
1377 }
1378 }
1379 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001380
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001382
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 /* Return pending indents/dedents */
1384 if (tok->pendin != 0) {
1385 if (tok->pendin < 0) {
1386 tok->pendin++;
1387 return DEDENT;
1388 }
1389 else {
1390 tok->pendin--;
1391 return INDENT;
1392 }
1393 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001394
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 tok->start = NULL;
1397 /* Skip spaces */
1398 do {
1399 c = tok_nextc(tok);
1400 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 /* Set start of current token */
1403 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001404
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001405 /* Skip comment */
1406 if (c == '#')
1407 while (c != EOF && c != '\n')
1408 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001409
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 /* Check for EOF and errors now */
1411 if (c == EOF) {
1412 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1413 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 /* Identifier (most frequent token!) */
1416 nonascii = 0;
1417 if (is_potential_identifier_start(c)) {
1418 /* Process b"", r"" and br"" */
1419 if (c == 'b' || c == 'B') {
1420 c = tok_nextc(tok);
1421 if (c == '"' || c == '\'')
1422 goto letter_quote;
1423 }
1424 if (c == 'r' || c == 'R') {
1425 c = tok_nextc(tok);
1426 if (c == '"' || c == '\'')
1427 goto letter_quote;
1428 }
1429 while (is_potential_identifier_char(c)) {
1430 if (c >= 128)
1431 nonascii = 1;
1432 c = tok_nextc(tok);
1433 }
1434 tok_backup(tok, c);
1435 if (nonascii &&
1436 !verify_identifier(tok)) {
1437 tok->done = E_IDENTIFIER;
1438 return ERRORTOKEN;
1439 }
1440 *p_start = tok->start;
1441 *p_end = tok->cur;
1442 return NAME;
1443 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 /* Newline */
1446 if (c == '\n') {
1447 tok->atbol = 1;
1448 if (blankline || tok->level > 0)
1449 goto nextline;
1450 *p_start = tok->start;
1451 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1452 tok->cont_line = 0;
1453 return NEWLINE;
1454 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001455
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 /* Period or number starting with period? */
1457 if (c == '.') {
1458 c = tok_nextc(tok);
1459 if (isdigit(c)) {
1460 goto fraction;
1461 } else if (c == '.') {
1462 c = tok_nextc(tok);
1463 if (c == '.') {
1464 *p_start = tok->start;
1465 *p_end = tok->cur;
1466 return ELLIPSIS;
1467 } else {
1468 tok_backup(tok, c);
1469 }
1470 tok_backup(tok, '.');
1471 } else {
1472 tok_backup(tok, c);
1473 }
1474 *p_start = tok->start;
1475 *p_end = tok->cur;
1476 return DOT;
1477 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001478
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001479 /* Number */
1480 if (isdigit(c)) {
1481 if (c == '0') {
1482 /* Hex, octal or binary -- maybe. */
1483 c = tok_nextc(tok);
1484 if (c == '.')
1485 goto fraction;
1486 if (c == 'j' || c == 'J')
1487 goto imaginary;
1488 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001489
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 /* Hex */
1491 c = tok_nextc(tok);
1492 if (!isxdigit(c)) {
1493 tok->done = E_TOKEN;
1494 tok_backup(tok, c);
1495 return ERRORTOKEN;
1496 }
1497 do {
1498 c = tok_nextc(tok);
1499 } while (isxdigit(c));
1500 }
1501 else if (c == 'o' || c == 'O') {
1502 /* Octal */
1503 c = tok_nextc(tok);
1504 if (c < '0' || c >= '8') {
1505 tok->done = E_TOKEN;
1506 tok_backup(tok, c);
1507 return ERRORTOKEN;
1508 }
1509 do {
1510 c = tok_nextc(tok);
1511 } while ('0' <= c && c < '8');
1512 }
1513 else if (c == 'b' || c == 'B') {
1514 /* Binary */
1515 c = tok_nextc(tok);
1516 if (c != '0' && c != '1') {
1517 tok->done = E_TOKEN;
1518 tok_backup(tok, c);
1519 return ERRORTOKEN;
1520 }
1521 do {
1522 c = tok_nextc(tok);
1523 } while (c == '0' || c == '1');
1524 }
1525 else {
1526 int nonzero = 0;
1527 /* maybe old-style octal; c is first char of it */
1528 /* in any case, allow '0' as a literal */
1529 while (c == '0')
1530 c = tok_nextc(tok);
1531 while (isdigit(c)) {
1532 nonzero = 1;
1533 c = tok_nextc(tok);
1534 }
1535 if (c == '.')
1536 goto fraction;
1537 else if (c == 'e' || c == 'E')
1538 goto exponent;
1539 else if (c == 'j' || c == 'J')
1540 goto imaginary;
1541 else if (nonzero) {
1542 tok->done = E_TOKEN;
1543 tok_backup(tok, c);
1544 return ERRORTOKEN;
1545 }
1546 }
1547 }
1548 else {
1549 /* Decimal */
1550 do {
1551 c = tok_nextc(tok);
1552 } while (isdigit(c));
1553 {
1554 /* Accept floating point numbers. */
1555 if (c == '.') {
1556 fraction:
1557 /* Fraction */
1558 do {
1559 c = tok_nextc(tok);
1560 } while (isdigit(c));
1561 }
1562 if (c == 'e' || c == 'E') {
1563 exponent:
1564 /* Exponent part */
1565 c = tok_nextc(tok);
1566 if (c == '+' || c == '-')
1567 c = tok_nextc(tok);
1568 if (!isdigit(c)) {
1569 tok->done = E_TOKEN;
1570 tok_backup(tok, c);
1571 return ERRORTOKEN;
1572 }
1573 do {
1574 c = tok_nextc(tok);
1575 } while (isdigit(c));
1576 }
1577 if (c == 'j' || c == 'J')
1578 /* Imaginary part */
1579 imaginary:
1580 c = tok_nextc(tok);
1581 }
1582 }
1583 tok_backup(tok, c);
1584 *p_start = tok->start;
1585 *p_end = tok->cur;
1586 return NUMBER;
1587 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001588
1589 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 /* String */
1591 if (c == '\'' || c == '"') {
1592 int quote = c;
1593 int quote_size = 1; /* 1 or 3 */
1594 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001595
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 /* Find the quote size and start of string */
1597 c = tok_nextc(tok);
1598 if (c == quote) {
1599 c = tok_nextc(tok);
1600 if (c == quote)
1601 quote_size = 3;
1602 else
1603 end_quote_size = 1; /* empty string found */
1604 }
1605 if (c != quote)
1606 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001607
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001608 /* Get rest of string */
1609 while (end_quote_size != quote_size) {
1610 c = tok_nextc(tok);
1611 if (c == EOF) {
1612 if (quote_size == 3)
1613 tok->done = E_EOFS;
1614 else
1615 tok->done = E_EOLS;
1616 tok->cur = tok->inp;
1617 return ERRORTOKEN;
1618 }
1619 if (quote_size == 1 && c == '\n') {
1620 tok->done = E_EOLS;
1621 tok->cur = tok->inp;
1622 return ERRORTOKEN;
1623 }
1624 if (c == quote)
1625 end_quote_size += 1;
1626 else {
1627 end_quote_size = 0;
1628 if (c == '\\')
1629 c = tok_nextc(tok); /* skip escaped char */
1630 }
1631 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001632
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 *p_start = tok->start;
1634 *p_end = tok->cur;
1635 return STRING;
1636 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001637
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 /* Line continuation */
1639 if (c == '\\') {
1640 c = tok_nextc(tok);
1641 if (c != '\n') {
1642 tok->done = E_LINECONT;
1643 tok->cur = tok->inp;
1644 return ERRORTOKEN;
1645 }
1646 tok->cont_line = 1;
1647 goto again; /* Read next line */
1648 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001649
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 /* Check for two-character token */
1651 {
1652 int c2 = tok_nextc(tok);
1653 int token = PyToken_TwoChars(c, c2);
1654 if (token != OP) {
1655 int c3 = tok_nextc(tok);
1656 int token3 = PyToken_ThreeChars(c, c2, c3);
1657 if (token3 != OP) {
1658 token = token3;
1659 } else {
1660 tok_backup(tok, c3);
1661 }
1662 *p_start = tok->start;
1663 *p_end = tok->cur;
1664 return token;
1665 }
1666 tok_backup(tok, c2);
1667 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001668
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 /* Keep track of parentheses nesting level */
1670 switch (c) {
1671 case '(':
1672 case '[':
1673 case '{':
1674 tok->level++;
1675 break;
1676 case ')':
1677 case ']':
1678 case '}':
1679 tok->level--;
1680 break;
1681 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001682
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 /* Punctuation character */
1684 *p_start = tok->start;
1685 *p_end = tok->cur;
1686 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001687}
1688
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001689int
1690PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1691{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 int result = tok_get(tok, p_start, p_end);
1693 if (tok->decoding_erred) {
1694 result = ERRORTOKEN;
1695 tok->done = E_DECODE;
1696 }
1697 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001698}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001699
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001700/* Get the encoding of a Python file. Check for the coding cookie and check if
1701 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001702
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001703 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1704 encoding in the first or second line of the file (in which case the encoding
1705 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001706
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001707 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1708 by the caller. */
1709
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001710char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001711PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001712{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713 struct tok_state *tok;
1714 FILE *fp;
1715 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001716
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 fd = dup(fd);
1718 if (fd < 0) {
1719 return NULL;
1720 }
1721 fp = fdopen(fd, "r");
1722 if (fp == NULL) {
1723 return NULL;
1724 }
1725 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1726 if (tok == NULL) {
1727 fclose(fp);
1728 return NULL;
1729 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001730#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001731 if (filename != NULL) {
1732 Py_INCREF(filename);
1733 tok->filename = filename;
1734 }
1735 else {
1736 tok->filename = PyUnicode_FromString("<string>");
1737 if (tok->filename == NULL) {
1738 fclose(fp);
1739 PyTokenizer_Free(tok);
1740 return encoding;
1741 }
1742 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001743#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 while (tok->lineno < 2 && tok->done == E_OK) {
1745 PyTokenizer_Get(tok, &p_start, &p_end);
1746 }
1747 fclose(fp);
1748 if (tok->encoding) {
1749 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1750 if (encoding)
1751 strcpy(encoding, tok->encoding);
1752 }
1753 PyTokenizer_Free(tok);
1754 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001755}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001756
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001757char *
1758PyTokenizer_FindEncoding(int fd)
1759{
1760 return PyTokenizer_FindEncodingFilename(fd, NULL);
1761}
1762
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout, followed by
   the token text [start, end) in parentheses for NAME, NUMBER, STRING
   and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    int has_text = (type == NAME || type == NUMBER ||
                    type == STRING || type == OP);

    printf("%s", _PyParser_TokenNames[type]);
    if (has_text)
        printf("(%.*s)", (int)(end - start), start);
}

#endif