blob: c2c182ce195837000350cb766293f1ed042cc2ec [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore, or any
   value >= 128 (a byte that is part of a non-ASCII character).
   Every use of the argument is parenthesized so that an expression argument
   (for instance a conditional expression) is compared as a whole.  The
   argument is still evaluated several times, so it must be free of side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128 (a byte that is part of a non-ASCII
   character).
   Every use of the argument is parenthesized so that an expression argument
   is compared as a whole.  The argument is still evaluated several times, so
   it must be free of side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable token names.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER",
    "LESS", "GREATER", "EQUAL",
    "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding is performed; return a newly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Normalize the common spellings of the UTF-8 and Latin-1 encoding names.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  Returns the string literal "utf-8" for "utf-8" and
   "utf-8-*", the string literal "iso-8859-1" for "latin-1", "iso-8859-1",
   "iso-latin-1" and their "-*" variants, and S itself (unchanged) for
   anything else.  Callers can therefore detect "not normalized" by comparing
   the result with S; a normalized result is not heap-allocated and must not
   be freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for unsigned char values and EOF;
               a plain char may be negative, so convert it first. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200465 _Py_IDENTIFIER(open);
466 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000467 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 io = PyImport_ImportModuleNoBlock("io");
470 if (io == NULL)
471 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472
Victor Stinner22a351a2010-10-14 12:04:34 +0000473 fd = fileno(tok->fp);
474 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
475 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
476 goto cleanup;
477 }
478
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200479 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000480 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000481 if (stream == NULL)
482 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000483
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000484 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200485 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000487
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000488 /* The file has been reopened; parsing will restart from
489 * the beginning of the file, we have to reset the line number.
490 * But this function has been called from inside tok_nextc() which
491 * will increment lineno before it returns. So we set it -1 so that
492 * the next call to tok_nextc() will start with tok->lineno == 0.
493 */
494 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000495
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000496 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 Py_XDECREF(stream);
498 Py_XDECREF(io);
499 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500}
501
502/* Fetch the next byte from TOK. */
503
504static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506}
507
508/* Unfetch the last byte back into TOK. */
509
510static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000511 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000512}
513
/* Check whether the bytes at s start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   s must point into a NUL-terminated string.  Only the shape of the
   sequence is checked (lead byte followed by the right number of
   continuation bytes 0x80..0xBF); overlong forms and lead bytes 0xF5..0xF7
   are not rejected here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Check the continuation bytes front to back: a NUL terminator is not a
       continuation byte, so the scan stops at it and never reads beyond the
       end of a truncated sequence. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
541
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542/* Read a line of input from TOK. Determine encoding
543 if necessary. */
544
545static char *
546decoding_fgets(char *s, int size, struct tok_state *tok)
547{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000548 char *line = NULL;
549 int badchar = 0;
550 for (;;) {
551 if (tok->decoding_state == STATE_NORMAL) {
552 /* We already have a codec associated with
553 this input. */
554 line = fp_readl(s, size, tok);
555 break;
556 } else if (tok->decoding_state == STATE_RAW) {
557 /* We want a 'raw' read. */
558 line = Py_UniversalNewlineFgets(s, size,
559 tok->fp, NULL);
560 break;
561 } else {
562 /* We have not yet determined the encoding.
563 If an encoding is found, use the file-pointer
564 reader functions from now on. */
565 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
566 return error_ret(tok);
567 assert(tok->decoding_state != STATE_INIT);
568 }
569 }
570 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
571 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
572 return error_ret(tok);
573 }
574 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000576 /* The default encoding is UTF-8, so make sure we don't have any
577 non-UTF-8 sequences in it. */
578 if (line && !tok->encoding) {
579 unsigned char *c;
580 int length;
581 for (c = (unsigned char *)line; *c; c += length)
582 if (!(length = valid_utf8(c))) {
583 badchar = *c;
584 break;
585 }
586 }
587 if (badchar) {
588 /* Need to add 1 to the line number, since this line
589 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200590 PyErr_Format(PyExc_SyntaxError,
591 "Non-UTF-8 code starting with '\\x%.2x' "
592 "in file %U on line %i, "
593 "but no encoding declared; "
594 "see http://python.org/dev/peps/pep-0263/ for details",
595 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000596 return error_ret(tok);
597 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000599 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600}
601
602static int
603decoding_feof(struct tok_state *tok)
604{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000605 if (tok->decoding_state != STATE_NORMAL) {
606 return feof(tok->fp);
607 } else {
608 PyObject* buf = tok->decoding_buffer;
609 if (buf == NULL) {
610 buf = PyObject_CallObject(tok->decoding_readline, NULL);
611 if (buf == NULL) {
612 error_ret(tok);
613 return 1;
614 } else {
615 tok->decoding_buffer = buf;
616 }
617 }
618 return PyObject_Length(buf) == 0;
619 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000620}
621
622/* Fetch a byte from TOK, using the string buffer. */
623
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000624static int
625buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627}
628
629/* Unfetch a byte from TOK, using the string buffer. */
630
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000631static void
632buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 tok->str--;
634 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635}
636
637/* Set the readline function for TOK to ENC. For the string-based
638 tokenizer, this means to just record the encoding. */
639
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000640static int
641buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 tok->enc = enc;
643 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644}
645
646/* Return a UTF-8 encoding Python string object from the
647 C byte string STR, which is encoded with ENC. */
648
649static PyObject *
650translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 PyObject *utf8;
652 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
653 if (buf == NULL)
654 return NULL;
655 utf8 = PyUnicode_AsUTF8String(buf);
656 Py_DECREF(buf);
657 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658}
659
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000660
661static char *
662translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
664 char *buf, *current;
665 char c = '\0';
666 buf = PyMem_MALLOC(needed_length);
667 if (buf == NULL) {
668 tok->done = E_NOMEM;
669 return NULL;
670 }
671 for (current = buf; *s; s++, current++) {
672 c = *s;
673 if (skip_next_lf) {
674 skip_next_lf = 0;
675 if (c == '\n') {
676 c = *++s;
677 if (!c)
678 break;
679 }
680 }
681 if (c == '\r') {
682 skip_next_lf = 1;
683 c = '\n';
684 }
685 *current = c;
686 }
687 /* If this is exec input, add a newline to the end of the string if
688 there isn't one already. */
689 if (exec_input && c != '\n') {
690 *current = '\n';
691 current++;
692 }
693 *current = '\0';
694 final_length = current - buf + 1;
695 if (final_length < needed_length && final_length)
696 /* should never fail */
697 buf = PyMem_REALLOC(buf, final_length);
698 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000699}
700
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000701/* Decode a byte string STR for use as the buffer of TOK.
702 Look for encoding declarations inside STR, and record them
703 inside TOK. */
704
705static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000706decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000707{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 PyObject* utf8 = NULL;
709 const char *str;
710 const char *s;
711 const char *newl[2] = {NULL, NULL};
712 int lineno = 0;
713 tok->input = str = translate_newlines(input, single, tok);
714 if (str == NULL)
715 return NULL;
716 tok->enc = NULL;
717 tok->str = str;
718 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
719 return error_ret(tok);
720 str = tok->str; /* string after BOM if any */
721 assert(str);
722 if (tok->enc != NULL) {
723 utf8 = translate_into_utf8(str, tok->enc);
724 if (utf8 == NULL)
725 return error_ret(tok);
726 str = PyBytes_AsString(utf8);
727 }
728 for (s = str;; s++) {
729 if (*s == '\0') break;
730 else if (*s == '\n') {
731 assert(lineno < 2);
732 newl[lineno] = s;
733 lineno++;
734 if (lineno == 2) break;
735 }
736 }
737 tok->enc = NULL;
738 /* need to check line 1 and 2 separately since check_coding_spec
739 assumes a single line as input */
740 if (newl[0]) {
741 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
742 return error_ret(tok);
743 if (tok->enc == NULL && newl[1]) {
744 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
745 tok, buf_setreadl))
746 return error_ret(tok);
747 }
748 }
749 if (tok->enc != NULL) {
750 assert(utf8 == NULL);
751 utf8 = translate_into_utf8(str, tok->enc);
752 if (utf8 == NULL)
753 return error_ret(tok);
754 str = PyBytes_AS_STRING(utf8);
755 }
756 assert(tok->decoding_buffer == NULL);
757 tok->decoding_buffer = utf8; /* CAUTION */
758 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000759}
760
761#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762
763/* Set up tokenizer for string */
764
765struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000766PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 struct tok_state *tok = tok_new();
769 if (tok == NULL)
770 return NULL;
771 str = (char *)decode_str(str, exec_input, tok);
772 if (str == NULL) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000776
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 /* XXX: constify members. */
778 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
779 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780}
781
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000782struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000783PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000784{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 struct tok_state *tok = tok_new();
786 if (tok == NULL)
787 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000788#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000790#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 if (str == NULL) {
792 PyTokenizer_Free(tok);
793 return NULL;
794 }
795 tok->decoding_state = STATE_RAW;
796 tok->read_coding_spec = 1;
797 tok->enc = NULL;
798 tok->str = str;
799 tok->encoding = (char *)PyMem_MALLOC(6);
800 if (!tok->encoding) {
801 PyTokenizer_Free(tok);
802 return NULL;
803 }
804 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000805
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 /* XXX: constify members. */
807 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
808 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000809}
810
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000811/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812
813struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000814PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000815{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 struct tok_state *tok = tok_new();
817 if (tok == NULL)
818 return NULL;
819 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
820 PyTokenizer_Free(tok);
821 return NULL;
822 }
823 tok->cur = tok->inp = tok->buf;
824 tok->end = tok->buf + BUFSIZ;
825 tok->fp = fp;
826 tok->prompt = ps1;
827 tok->nextprompt = ps2;
828 if (enc != NULL) {
829 /* Must copy encoding declaration since it
830 gets copied into the parse tree. */
831 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
832 if (!tok->encoding) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 strcpy(tok->encoding, enc);
837 tok->decoding_state = STATE_NORMAL;
838 }
839 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000840}
841
842
843/* Free a tok_state structure */
844
845void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000846PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 if (tok->encoding != NULL)
849 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000850#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000851 Py_XDECREF(tok->decoding_readline);
852 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200853 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000854#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000855 if (tok->fp != NULL && tok->buf != NULL)
856 PyMem_FREE(tok->buf);
857 if (tok->input)
858 PyMem_FREE((char *)tok->input);
859 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860}
861
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862/* Get next char, updating state; error code goes into tok->done */
863
864static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000865tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000866{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 for (;;) {
868 if (tok->cur != tok->inp) {
869 return Py_CHARMASK(*tok->cur++); /* Fast path */
870 }
871 if (tok->done != E_OK)
872 return EOF;
873 if (tok->fp == NULL) {
874 char *end = strchr(tok->inp, '\n');
875 if (end != NULL)
876 end++;
877 else {
878 end = strchr(tok->inp, '\0');
879 if (end == tok->inp) {
880 tok->done = E_EOF;
881 return EOF;
882 }
883 }
884 if (tok->start == NULL)
885 tok->buf = tok->cur;
886 tok->line_start = tok->cur;
887 tok->lineno++;
888 tok->inp = end;
889 return Py_CHARMASK(*tok->cur++);
890 }
891 if (tok->prompt != NULL) {
892 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000893#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000894 if (newtok != NULL) {
895 char *translated = translate_newlines(newtok, 0, tok);
896 PyMem_FREE(newtok);
897 if (translated == NULL)
898 return EOF;
899 newtok = translated;
900 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 if (tok->encoding && newtok && *newtok) {
902 /* Recode to UTF-8 */
903 Py_ssize_t buflen;
904 const char* buf;
905 PyObject *u = translate_into_utf8(newtok, tok->encoding);
906 PyMem_FREE(newtok);
907 if (!u) {
908 tok->done = E_DECODE;
909 return EOF;
910 }
911 buflen = PyBytes_GET_SIZE(u);
912 buf = PyBytes_AS_STRING(u);
913 if (!buf) {
914 Py_DECREF(u);
915 tok->done = E_DECODE;
916 return EOF;
917 }
918 newtok = PyMem_MALLOC(buflen+1);
919 strcpy(newtok, buf);
920 Py_DECREF(u);
921 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000922#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 if (tok->nextprompt != NULL)
924 tok->prompt = tok->nextprompt;
925 if (newtok == NULL)
926 tok->done = E_INTR;
927 else if (*newtok == '\0') {
928 PyMem_FREE(newtok);
929 tok->done = E_EOF;
930 }
931 else if (tok->start != NULL) {
932 size_t start = tok->start - tok->buf;
933 size_t oldlen = tok->cur - tok->buf;
934 size_t newlen = oldlen + strlen(newtok);
935 char *buf = tok->buf;
936 buf = (char *)PyMem_REALLOC(buf, newlen+1);
937 tok->lineno++;
938 if (buf == NULL) {
939 PyMem_FREE(tok->buf);
940 tok->buf = NULL;
941 PyMem_FREE(newtok);
942 tok->done = E_NOMEM;
943 return EOF;
944 }
945 tok->buf = buf;
946 tok->cur = tok->buf + oldlen;
947 tok->line_start = tok->cur;
948 strcpy(tok->buf + oldlen, newtok);
949 PyMem_FREE(newtok);
950 tok->inp = tok->buf + newlen;
951 tok->end = tok->inp + 1;
952 tok->start = tok->buf + start;
953 }
954 else {
955 tok->lineno++;
956 if (tok->buf != NULL)
957 PyMem_FREE(tok->buf);
958 tok->buf = newtok;
959 tok->line_start = tok->buf;
960 tok->cur = tok->buf;
961 tok->line_start = tok->buf;
962 tok->inp = strchr(tok->buf, '\0');
963 tok->end = tok->inp + 1;
964 }
965 }
966 else {
967 int done = 0;
968 Py_ssize_t cur = 0;
969 char *pt;
970 if (tok->start == NULL) {
971 if (tok->buf == NULL) {
972 tok->buf = (char *)
973 PyMem_MALLOC(BUFSIZ);
974 if (tok->buf == NULL) {
975 tok->done = E_NOMEM;
976 return EOF;
977 }
978 tok->end = tok->buf + BUFSIZ;
979 }
980 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
981 tok) == NULL) {
982 tok->done = E_EOF;
983 done = 1;
984 }
985 else {
986 tok->done = E_OK;
987 tok->inp = strchr(tok->buf, '\0');
988 done = tok->inp[-1] == '\n';
989 }
990 }
991 else {
992 cur = tok->cur - tok->buf;
993 if (decoding_feof(tok)) {
994 tok->done = E_EOF;
995 done = 1;
996 }
997 else
998 tok->done = E_OK;
999 }
1000 tok->lineno++;
1001 /* Read until '\n' or EOF */
1002 while (!done) {
1003 Py_ssize_t curstart = tok->start == NULL ? -1 :
1004 tok->start - tok->buf;
1005 Py_ssize_t curvalid = tok->inp - tok->buf;
1006 Py_ssize_t newsize = curvalid + BUFSIZ;
1007 char *newbuf = tok->buf;
1008 newbuf = (char *)PyMem_REALLOC(newbuf,
1009 newsize);
1010 if (newbuf == NULL) {
1011 tok->done = E_NOMEM;
1012 tok->cur = tok->inp;
1013 return EOF;
1014 }
1015 tok->buf = newbuf;
1016 tok->inp = tok->buf + curvalid;
1017 tok->end = tok->buf + newsize;
1018 tok->start = curstart < 0 ? NULL :
1019 tok->buf + curstart;
1020 if (decoding_fgets(tok->inp,
1021 (int)(tok->end - tok->inp),
1022 tok) == NULL) {
1023 /* Break out early on decoding
1024 errors, as tok->buf will be NULL
1025 */
1026 if (tok->decoding_erred)
1027 return EOF;
1028 /* Last line does not end in \n,
1029 fake one */
1030 strcpy(tok->inp, "\n");
1031 }
1032 tok->inp = strchr(tok->inp, '\0');
1033 done = tok->inp[-1] == '\n';
1034 }
1035 if (tok->buf != NULL) {
1036 tok->cur = tok->buf + cur;
1037 tok->line_start = tok->cur;
1038 /* replace "\r\n" with "\n" */
1039 /* For Mac leave the \r, giving a syntax error */
1040 pt = tok->inp - 2;
1041 if (pt >= tok->buf && *pt == '\r') {
1042 *pt++ = '\n';
1043 *pt = '\0';
1044 tok->inp = pt;
1045 }
1046 }
1047 }
1048 if (tok->done != E_OK) {
1049 if (tok->prompt != NULL)
1050 PySys_WriteStderr("\n");
1051 tok->cur = tok->inp;
1052 return EOF;
1053 }
1054 }
1055 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056}
1057
1058
1059/* Back-up one character */
1060
1061static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001062tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001063{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064 if (c != EOF) {
1065 if (--tok->cur < tok->buf)
1066 Py_FatalError("tok_backup: beginning of buffer");
1067 if (*tok->cur != c)
1068 *tok->cur = c;
1069 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001070}
1071
1072
1073/* Return the token corresponding to a single character */
1074
1075int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001076PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001077{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 switch (c) {
1079 case '(': return LPAR;
1080 case ')': return RPAR;
1081 case '[': return LSQB;
1082 case ']': return RSQB;
1083 case ':': return COLON;
1084 case ',': return COMMA;
1085 case ';': return SEMI;
1086 case '+': return PLUS;
1087 case '-': return MINUS;
1088 case '*': return STAR;
1089 case '/': return SLASH;
1090 case '|': return VBAR;
1091 case '&': return AMPER;
1092 case '<': return LESS;
1093 case '>': return GREATER;
1094 case '=': return EQUAL;
1095 case '.': return DOT;
1096 case '%': return PERCENT;
1097 case '{': return LBRACE;
1098 case '}': return RBRACE;
1099 case '^': return CIRCUMFLEX;
1100 case '~': return TILDE;
1101 case '@': return AT;
1102 default: return OP;
1103 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104}
1105
1106
Guido van Rossumfbab9051991-10-20 20:25:03 +00001107int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001108PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001109{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 switch (c1) {
1111 case '=':
1112 switch (c2) {
1113 case '=': return EQEQUAL;
1114 }
1115 break;
1116 case '!':
1117 switch (c2) {
1118 case '=': return NOTEQUAL;
1119 }
1120 break;
1121 case '<':
1122 switch (c2) {
1123 case '>': return NOTEQUAL;
1124 case '=': return LESSEQUAL;
1125 case '<': return LEFTSHIFT;
1126 }
1127 break;
1128 case '>':
1129 switch (c2) {
1130 case '=': return GREATEREQUAL;
1131 case '>': return RIGHTSHIFT;
1132 }
1133 break;
1134 case '+':
1135 switch (c2) {
1136 case '=': return PLUSEQUAL;
1137 }
1138 break;
1139 case '-':
1140 switch (c2) {
1141 case '=': return MINEQUAL;
1142 case '>': return RARROW;
1143 }
1144 break;
1145 case '*':
1146 switch (c2) {
1147 case '*': return DOUBLESTAR;
1148 case '=': return STAREQUAL;
1149 }
1150 break;
1151 case '/':
1152 switch (c2) {
1153 case '/': return DOUBLESLASH;
1154 case '=': return SLASHEQUAL;
1155 }
1156 break;
1157 case '|':
1158 switch (c2) {
1159 case '=': return VBAREQUAL;
1160 }
1161 break;
1162 case '%':
1163 switch (c2) {
1164 case '=': return PERCENTEQUAL;
1165 }
1166 break;
1167 case '&':
1168 switch (c2) {
1169 case '=': return AMPEREQUAL;
1170 }
1171 break;
1172 case '^':
1173 switch (c2) {
1174 case '=': return CIRCUMFLEXEQUAL;
1175 }
1176 break;
1177 }
1178 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001179}
1180
Thomas Wouters434d0822000-08-24 20:11:32 +00001181int
1182PyToken_ThreeChars(int c1, int c2, int c3)
1183{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001184 switch (c1) {
1185 case '<':
1186 switch (c2) {
1187 case '<':
1188 switch (c3) {
1189 case '=':
1190 return LEFTSHIFTEQUAL;
1191 }
1192 break;
1193 }
1194 break;
1195 case '>':
1196 switch (c2) {
1197 case '>':
1198 switch (c3) {
1199 case '=':
1200 return RIGHTSHIFTEQUAL;
1201 }
1202 break;
1203 }
1204 break;
1205 case '*':
1206 switch (c2) {
1207 case '*':
1208 switch (c3) {
1209 case '=':
1210 return DOUBLESTAREQUAL;
1211 }
1212 break;
1213 }
1214 break;
1215 case '/':
1216 switch (c2) {
1217 case '/':
1218 switch (c3) {
1219 case '=':
1220 return DOUBLESLASHEQUAL;
1221 }
1222 break;
1223 }
1224 break;
1225 case '.':
1226 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001227 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 switch (c3) {
1229 case '.':
1230 return ELLIPSIS;
1231 }
1232 break;
1233 }
1234 break;
1235 }
1236 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001237}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001238
Guido van Rossum926f13a1998-04-09 21:38:06 +00001239static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001240indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001241{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 if (tok->alterror) {
1243 tok->done = E_TABSPACE;
1244 tok->cur = tok->inp;
1245 return 1;
1246 }
1247 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001248#ifdef PGEN
1249 PySys_WriteStderr("inconsistent use of tabs and spaces "
1250 "in indentation\n");
1251#else
1252 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001254#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001255 tok->altwarning = 0;
1256 }
1257 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001258}
1259
Martin v. Löwis47383402007-08-15 07:32:56 +00001260#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001261#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001262#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263/* Verify that the identifier follows PEP 3131.
1264 All identifier strings are guaranteed to be "ready" unicode objects.
1265 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001266static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001267verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001268{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 PyObject *s;
1270 int result;
1271 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1274 PyErr_Clear();
1275 tok->done = E_IDENTIFIER;
1276 } else {
1277 tok->done = E_ERROR;
1278 }
1279 return 0;
1280 }
1281 result = PyUnicode_IsIdentifier(s);
1282 Py_DECREF(s);
1283 if (result == 0)
1284 tok->done = E_IDENTIFIER;
1285 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001286}
1287#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001288
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289/* Get next token, after space stripping etc. */
1290
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001291static int
1292tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 register int c;
1295 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001296
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001298 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 tok->start = NULL;
1300 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001301
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 /* Get indentation level */
1303 if (tok->atbol) {
1304 register int col = 0;
1305 register int altcol = 0;
1306 tok->atbol = 0;
1307 for (;;) {
1308 c = tok_nextc(tok);
1309 if (c == ' ')
1310 col++, altcol++;
1311 else if (c == '\t') {
1312 col = (col/tok->tabsize + 1) * tok->tabsize;
1313 altcol = (altcol/tok->alttabsize + 1)
1314 * tok->alttabsize;
1315 }
1316 else if (c == '\014') /* Control-L (formfeed) */
1317 col = altcol = 0; /* For Emacs users */
1318 else
1319 break;
1320 }
1321 tok_backup(tok, c);
1322 if (c == '#' || c == '\n') {
1323 /* Lines with only whitespace and/or comments
1324 shouldn't affect the indentation and are
1325 not passed to the parser as NEWLINE tokens,
1326 except *totally* empty lines in interactive
1327 mode, which signal the end of a command group. */
1328 if (col == 0 && c == '\n' && tok->prompt != NULL)
1329 blankline = 0; /* Let it through */
1330 else
1331 blankline = 1; /* Ignore completely */
1332 /* We can't jump back right here since we still
1333 may need to skip to the end of a comment */
1334 }
1335 if (!blankline && tok->level == 0) {
1336 if (col == tok->indstack[tok->indent]) {
1337 /* No change */
1338 if (altcol != tok->altindstack[tok->indent]) {
1339 if (indenterror(tok))
1340 return ERRORTOKEN;
1341 }
1342 }
1343 else if (col > tok->indstack[tok->indent]) {
1344 /* Indent -- always one */
1345 if (tok->indent+1 >= MAXINDENT) {
1346 tok->done = E_TOODEEP;
1347 tok->cur = tok->inp;
1348 return ERRORTOKEN;
1349 }
1350 if (altcol <= tok->altindstack[tok->indent]) {
1351 if (indenterror(tok))
1352 return ERRORTOKEN;
1353 }
1354 tok->pendin++;
1355 tok->indstack[++tok->indent] = col;
1356 tok->altindstack[tok->indent] = altcol;
1357 }
1358 else /* col < tok->indstack[tok->indent] */ {
1359 /* Dedent -- any number, must be consistent */
1360 while (tok->indent > 0 &&
1361 col < tok->indstack[tok->indent]) {
1362 tok->pendin--;
1363 tok->indent--;
1364 }
1365 if (col != tok->indstack[tok->indent]) {
1366 tok->done = E_DEDENT;
1367 tok->cur = tok->inp;
1368 return ERRORTOKEN;
1369 }
1370 if (altcol != tok->altindstack[tok->indent]) {
1371 if (indenterror(tok))
1372 return ERRORTOKEN;
1373 }
1374 }
1375 }
1376 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001379
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 /* Return pending indents/dedents */
1381 if (tok->pendin != 0) {
1382 if (tok->pendin < 0) {
1383 tok->pendin++;
1384 return DEDENT;
1385 }
1386 else {
1387 tok->pendin--;
1388 return INDENT;
1389 }
1390 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001391
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001392 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 tok->start = NULL;
1394 /* Skip spaces */
1395 do {
1396 c = tok_nextc(tok);
1397 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001398
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399 /* Set start of current token */
1400 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 /* Skip comment */
1403 if (c == '#')
1404 while (c != EOF && c != '\n')
1405 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 /* Check for EOF and errors now */
1408 if (c == EOF) {
1409 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1410 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 /* Identifier (most frequent token!) */
1413 nonascii = 0;
1414 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001415 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001416 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001417 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001418 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001419 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001420 /* Since this is a backwards compatibility support literal we don't
1421 want to support it in arbitrary order like byte literals. */
1422 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1423 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001424 /* ur"" and ru"" are not supported */
1425 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001426 saw_r = 1;
1427 else
1428 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001429 c = tok_nextc(tok);
1430 if (c == '"' || c == '\'')
1431 goto letter_quote;
1432 }
1433 while (is_potential_identifier_char(c)) {
1434 if (c >= 128)
1435 nonascii = 1;
1436 c = tok_nextc(tok);
1437 }
1438 tok_backup(tok, c);
1439 if (nonascii &&
1440 !verify_identifier(tok)) {
1441 tok->done = E_IDENTIFIER;
1442 return ERRORTOKEN;
1443 }
1444 *p_start = tok->start;
1445 *p_end = tok->cur;
1446 return NAME;
1447 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 /* Newline */
1450 if (c == '\n') {
1451 tok->atbol = 1;
1452 if (blankline || tok->level > 0)
1453 goto nextline;
1454 *p_start = tok->start;
1455 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1456 tok->cont_line = 0;
1457 return NEWLINE;
1458 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 /* Period or number starting with period? */
1461 if (c == '.') {
1462 c = tok_nextc(tok);
1463 if (isdigit(c)) {
1464 goto fraction;
1465 } else if (c == '.') {
1466 c = tok_nextc(tok);
1467 if (c == '.') {
1468 *p_start = tok->start;
1469 *p_end = tok->cur;
1470 return ELLIPSIS;
1471 } else {
1472 tok_backup(tok, c);
1473 }
1474 tok_backup(tok, '.');
1475 } else {
1476 tok_backup(tok, c);
1477 }
1478 *p_start = tok->start;
1479 *p_end = tok->cur;
1480 return DOT;
1481 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001482
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 /* Number */
1484 if (isdigit(c)) {
1485 if (c == '0') {
1486 /* Hex, octal or binary -- maybe. */
1487 c = tok_nextc(tok);
1488 if (c == '.')
1489 goto fraction;
1490 if (c == 'j' || c == 'J')
1491 goto imaginary;
1492 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001493
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001494 /* Hex */
1495 c = tok_nextc(tok);
1496 if (!isxdigit(c)) {
1497 tok->done = E_TOKEN;
1498 tok_backup(tok, c);
1499 return ERRORTOKEN;
1500 }
1501 do {
1502 c = tok_nextc(tok);
1503 } while (isxdigit(c));
1504 }
1505 else if (c == 'o' || c == 'O') {
1506 /* Octal */
1507 c = tok_nextc(tok);
1508 if (c < '0' || c >= '8') {
1509 tok->done = E_TOKEN;
1510 tok_backup(tok, c);
1511 return ERRORTOKEN;
1512 }
1513 do {
1514 c = tok_nextc(tok);
1515 } while ('0' <= c && c < '8');
1516 }
1517 else if (c == 'b' || c == 'B') {
1518 /* Binary */
1519 c = tok_nextc(tok);
1520 if (c != '0' && c != '1') {
1521 tok->done = E_TOKEN;
1522 tok_backup(tok, c);
1523 return ERRORTOKEN;
1524 }
1525 do {
1526 c = tok_nextc(tok);
1527 } while (c == '0' || c == '1');
1528 }
1529 else {
1530 int nonzero = 0;
1531 /* maybe old-style octal; c is first char of it */
1532 /* in any case, allow '0' as a literal */
1533 while (c == '0')
1534 c = tok_nextc(tok);
1535 while (isdigit(c)) {
1536 nonzero = 1;
1537 c = tok_nextc(tok);
1538 }
1539 if (c == '.')
1540 goto fraction;
1541 else if (c == 'e' || c == 'E')
1542 goto exponent;
1543 else if (c == 'j' || c == 'J')
1544 goto imaginary;
1545 else if (nonzero) {
1546 tok->done = E_TOKEN;
1547 tok_backup(tok, c);
1548 return ERRORTOKEN;
1549 }
1550 }
1551 }
1552 else {
1553 /* Decimal */
1554 do {
1555 c = tok_nextc(tok);
1556 } while (isdigit(c));
1557 {
1558 /* Accept floating point numbers. */
1559 if (c == '.') {
1560 fraction:
1561 /* Fraction */
1562 do {
1563 c = tok_nextc(tok);
1564 } while (isdigit(c));
1565 }
1566 if (c == 'e' || c == 'E') {
1567 exponent:
1568 /* Exponent part */
1569 c = tok_nextc(tok);
1570 if (c == '+' || c == '-')
1571 c = tok_nextc(tok);
1572 if (!isdigit(c)) {
1573 tok->done = E_TOKEN;
1574 tok_backup(tok, c);
1575 return ERRORTOKEN;
1576 }
1577 do {
1578 c = tok_nextc(tok);
1579 } while (isdigit(c));
1580 }
1581 if (c == 'j' || c == 'J')
1582 /* Imaginary part */
1583 imaginary:
1584 c = tok_nextc(tok);
1585 }
1586 }
1587 tok_backup(tok, c);
1588 *p_start = tok->start;
1589 *p_end = tok->cur;
1590 return NUMBER;
1591 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001592
1593 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 /* String */
1595 if (c == '\'' || c == '"') {
1596 int quote = c;
1597 int quote_size = 1; /* 1 or 3 */
1598 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001599
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 /* Find the quote size and start of string */
1601 c = tok_nextc(tok);
1602 if (c == quote) {
1603 c = tok_nextc(tok);
1604 if (c == quote)
1605 quote_size = 3;
1606 else
1607 end_quote_size = 1; /* empty string found */
1608 }
1609 if (c != quote)
1610 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001611
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 /* Get rest of string */
1613 while (end_quote_size != quote_size) {
1614 c = tok_nextc(tok);
1615 if (c == EOF) {
1616 if (quote_size == 3)
1617 tok->done = E_EOFS;
1618 else
1619 tok->done = E_EOLS;
1620 tok->cur = tok->inp;
1621 return ERRORTOKEN;
1622 }
1623 if (quote_size == 1 && c == '\n') {
1624 tok->done = E_EOLS;
1625 tok->cur = tok->inp;
1626 return ERRORTOKEN;
1627 }
1628 if (c == quote)
1629 end_quote_size += 1;
1630 else {
1631 end_quote_size = 0;
1632 if (c == '\\')
1633 c = tok_nextc(tok); /* skip escaped char */
1634 }
1635 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001636
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 *p_start = tok->start;
1638 *p_end = tok->cur;
1639 return STRING;
1640 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001641
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 /* Line continuation */
1643 if (c == '\\') {
1644 c = tok_nextc(tok);
1645 if (c != '\n') {
1646 tok->done = E_LINECONT;
1647 tok->cur = tok->inp;
1648 return ERRORTOKEN;
1649 }
1650 tok->cont_line = 1;
1651 goto again; /* Read next line */
1652 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001653
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Check for two-character token */
1655 {
1656 int c2 = tok_nextc(tok);
1657 int token = PyToken_TwoChars(c, c2);
1658 if (token != OP) {
1659 int c3 = tok_nextc(tok);
1660 int token3 = PyToken_ThreeChars(c, c2, c3);
1661 if (token3 != OP) {
1662 token = token3;
1663 } else {
1664 tok_backup(tok, c3);
1665 }
1666 *p_start = tok->start;
1667 *p_end = tok->cur;
1668 return token;
1669 }
1670 tok_backup(tok, c2);
1671 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001672
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 /* Keep track of parentheses nesting level */
1674 switch (c) {
1675 case '(':
1676 case '[':
1677 case '{':
1678 tok->level++;
1679 break;
1680 case ')':
1681 case ']':
1682 case '}':
1683 tok->level--;
1684 break;
1685 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001686
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 /* Punctuation character */
1688 *p_start = tok->start;
1689 *p_end = tok->cur;
1690 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001691}
1692
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001693int
1694PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1695{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 int result = tok_get(tok, p_start, p_end);
1697 if (tok->decoding_erred) {
1698 result = ERRORTOKEN;
1699 tok->done = E_DECODE;
1700 }
1701 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001702}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001703
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001704/* Get the encoding of a Python file. Check for the coding cookie and check if
1705 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001706
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001707 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1708 encoding in the first or second line of the file (in which case the encoding
1709 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001710
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001711 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1712 by the caller. */
1713
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001714char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001715PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001716{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 struct tok_state *tok;
1718 FILE *fp;
1719 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001720
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 fd = dup(fd);
1722 if (fd < 0) {
1723 return NULL;
1724 }
1725 fp = fdopen(fd, "r");
1726 if (fp == NULL) {
1727 return NULL;
1728 }
1729 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1730 if (tok == NULL) {
1731 fclose(fp);
1732 return NULL;
1733 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001734#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001735 if (filename != NULL) {
1736 Py_INCREF(filename);
1737 tok->filename = filename;
1738 }
1739 else {
1740 tok->filename = PyUnicode_FromString("<string>");
1741 if (tok->filename == NULL) {
1742 fclose(fp);
1743 PyTokenizer_Free(tok);
1744 return encoding;
1745 }
1746 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001747#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 while (tok->lineno < 2 && tok->done == E_OK) {
1749 PyTokenizer_Get(tok, &p_start, &p_end);
1750 }
1751 fclose(fp);
1752 if (tok->encoding) {
1753 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1754 if (encoding)
1755 strcpy(encoding, tok->encoding);
1756 }
1757 PyTokenizer_Free(tok);
1758 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001759}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001760
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001761char *
1762PyTokenizer_FindEncoding(int fd)
1763{
1764 return PyTokenizer_FindEncodingFilename(fd, NULL);
1765}
1766
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout, followed by the
   token text [start, end) in parentheses for NAME, NUMBER, STRING and OP. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP: {
        int text_len = (int)(end - start);
        printf("(%.*s)", text_len, start);
        break;
    }
    default:
        break;
    }
}

#endif