blob: 62b1a91b87c915e5d1e83b1317bbc913c074155d [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore, or any
   value >= 128.  C is expected to be an int holding an unsigned byte value
   (or EOF); a plain negative char never satisfies the >= 128 test.
   The argument is parenthesized at every use so that expression arguments
   such as `ch & 0xff` or `x ? a : b` expand with the intended precedence.
   Note that C is evaluated several times: do not pass expressions with
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128.  C is expected to be an int holding an
   unsigned byte value (or EOF).
   The argument is parenthesized at every use so that expression arguments
   expand with the intended precedence.  C is evaluated several times: do
   not pass expressions with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Printable names of the token types, indexed by token number.
   The order of the entries is significant:
   this table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    /* 0-6 */
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE", "INDENT", "DEDENT",
    /* 7-13 */
    "LPAR", "RPAR", "LSQB", "RSQB", "COLON", "COMMA", "SEMI",
    /* 14-19 */
    "PLUS", "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20-26 */
    "LESS", "GREATER", "EQUAL", "DOT", "PERCENT", "LBRACE", "RBRACE",
    /* 27-30 */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31-35 */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36-40 */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    /* 41-43 */
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44-46 */
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47-51 */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    /* 52-54 */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding is performed; return a fresh copy of STR made
   by new_string (NULL if that allocation fails).  EXEC_INPUT and TOK are
   accepted only to match the signature of the non-PGEN variant. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the usual spellings of UTF-8 and Latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when the normalized prefix is one of the recognised
   spellings (optionally followed by "-<suffix>"); otherwise returns S
   itself.  Callers distinguish the two cases by comparing the result
   with S, and must not modify or free a returned literal.

   Each byte is converted to unsigned char before it is handed to
   tolower(): passing a negative value (a byte >= 0x80 where plain char
   is signed) to the <ctype.h> functions is undefined behavior. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300294 else {
295 PyErr_Format(PyExc_SyntaxError,
296 "encoding problem: %s", cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000297 PyMem_FREE(cs);
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300298 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000299 }
300 } else { /* then, compare cs with BOM */
301 r = (strcmp(tok->encoding, cs) == 0);
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300302 if (!r)
303 PyErr_Format(PyExc_SyntaxError,
304 "encoding problem: %s with BOM", cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 PyMem_FREE(cs);
306 }
307 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200465 _Py_IDENTIFIER(open);
466 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000467 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 io = PyImport_ImportModuleNoBlock("io");
470 if (io == NULL)
471 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472
Victor Stinner22a351a2010-10-14 12:04:34 +0000473 fd = fileno(tok->fp);
474 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
475 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
476 goto cleanup;
477 }
478
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200479 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000480 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000481 if (stream == NULL)
482 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000483
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000484 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200485 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000487
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000488 /* The file has been reopened; parsing will restart from
489 * the beginning of the file, we have to reset the line number.
490 * But this function has been called from inside tok_nextc() which
491 * will increment lineno before it returns. So we set it -1 so that
492 * the next call to tok_nextc() will start with tok->lineno == 0.
493 */
494 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000495
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000496 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 Py_XDECREF(stream);
498 Py_XDECREF(io);
499 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500}
501
502/* Fetch the next byte from TOK. */
503
504static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506}
507
508/* Unfetch the last byte back into TOK. */
509
510static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000511 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000512}
513
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   Only the shape is checked: the lead byte selects the length and each
   following byte must lie in 0x80..0xBF.  Overlong forms and code
   points out of range are not rejected here.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the
   terminator itself and no byte beyond it is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* A NUL terminator fails this test (0 < 0x80), which stops the
           scan before anything past the end of the string is touched. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
541
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542/* Read a line of input from TOK. Determine encoding
543 if necessary. */
544
545static char *
546decoding_fgets(char *s, int size, struct tok_state *tok)
547{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000548 char *line = NULL;
549 int badchar = 0;
550 for (;;) {
551 if (tok->decoding_state == STATE_NORMAL) {
552 /* We already have a codec associated with
553 this input. */
554 line = fp_readl(s, size, tok);
555 break;
556 } else if (tok->decoding_state == STATE_RAW) {
557 /* We want a 'raw' read. */
558 line = Py_UniversalNewlineFgets(s, size,
559 tok->fp, NULL);
560 break;
561 } else {
562 /* We have not yet determined the encoding.
563 If an encoding is found, use the file-pointer
564 reader functions from now on. */
565 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
566 return error_ret(tok);
567 assert(tok->decoding_state != STATE_INIT);
568 }
569 }
570 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
571 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
572 return error_ret(tok);
573 }
574 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000576 /* The default encoding is UTF-8, so make sure we don't have any
577 non-UTF-8 sequences in it. */
578 if (line && !tok->encoding) {
579 unsigned char *c;
580 int length;
581 for (c = (unsigned char *)line; *c; c += length)
582 if (!(length = valid_utf8(c))) {
583 badchar = *c;
584 break;
585 }
586 }
587 if (badchar) {
588 /* Need to add 1 to the line number, since this line
589 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200590 PyErr_Format(PyExc_SyntaxError,
591 "Non-UTF-8 code starting with '\\x%.2x' "
592 "in file %U on line %i, "
593 "but no encoding declared; "
594 "see http://python.org/dev/peps/pep-0263/ for details",
595 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000596 return error_ret(tok);
597 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000599 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600}
601
602static int
603decoding_feof(struct tok_state *tok)
604{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000605 if (tok->decoding_state != STATE_NORMAL) {
606 return feof(tok->fp);
607 } else {
608 PyObject* buf = tok->decoding_buffer;
609 if (buf == NULL) {
610 buf = PyObject_CallObject(tok->decoding_readline, NULL);
611 if (buf == NULL) {
612 error_ret(tok);
613 return 1;
614 } else {
615 tok->decoding_buffer = buf;
616 }
617 }
618 return PyObject_Length(buf) == 0;
619 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000620}
621
622/* Fetch a byte from TOK, using the string buffer. */
623
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000624static int
625buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627}
628
629/* Unfetch a byte from TOK, using the string buffer. */
630
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000631static void
632buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 tok->str--;
634 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635}
636
637/* Set the readline function for TOK to ENC. For the string-based
638 tokenizer, this means to just record the encoding. */
639
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000640static int
641buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 tok->enc = enc;
643 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644}
645
646/* Return a UTF-8 encoding Python string object from the
647 C byte string STR, which is encoded with ENC. */
648
649static PyObject *
650translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 PyObject *utf8;
652 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
653 if (buf == NULL)
654 return NULL;
655 utf8 = PyUnicode_AsUTF8String(buf);
656 Py_DECREF(buf);
657 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658}
659
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000660
661static char *
662translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200663 int skip_next_lf = 0;
664 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 char *buf, *current;
666 char c = '\0';
667 buf = PyMem_MALLOC(needed_length);
668 if (buf == NULL) {
669 tok->done = E_NOMEM;
670 return NULL;
671 }
672 for (current = buf; *s; s++, current++) {
673 c = *s;
674 if (skip_next_lf) {
675 skip_next_lf = 0;
676 if (c == '\n') {
677 c = *++s;
678 if (!c)
679 break;
680 }
681 }
682 if (c == '\r') {
683 skip_next_lf = 1;
684 c = '\n';
685 }
686 *current = c;
687 }
688 /* If this is exec input, add a newline to the end of the string if
689 there isn't one already. */
690 if (exec_input && c != '\n') {
691 *current = '\n';
692 current++;
693 }
694 *current = '\0';
695 final_length = current - buf + 1;
696 if (final_length < needed_length && final_length)
697 /* should never fail */
698 buf = PyMem_REALLOC(buf, final_length);
699 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000700}
701
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702/* Decode a byte string STR for use as the buffer of TOK.
703 Look for encoding declarations inside STR, and record them
704 inside TOK. */
705
706static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000707decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000708{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 PyObject* utf8 = NULL;
710 const char *str;
711 const char *s;
712 const char *newl[2] = {NULL, NULL};
713 int lineno = 0;
714 tok->input = str = translate_newlines(input, single, tok);
715 if (str == NULL)
716 return NULL;
717 tok->enc = NULL;
718 tok->str = str;
719 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
720 return error_ret(tok);
721 str = tok->str; /* string after BOM if any */
722 assert(str);
723 if (tok->enc != NULL) {
724 utf8 = translate_into_utf8(str, tok->enc);
725 if (utf8 == NULL)
726 return error_ret(tok);
727 str = PyBytes_AsString(utf8);
728 }
729 for (s = str;; s++) {
730 if (*s == '\0') break;
731 else if (*s == '\n') {
732 assert(lineno < 2);
733 newl[lineno] = s;
734 lineno++;
735 if (lineno == 2) break;
736 }
737 }
738 tok->enc = NULL;
739 /* need to check line 1 and 2 separately since check_coding_spec
740 assumes a single line as input */
741 if (newl[0]) {
742 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
743 return error_ret(tok);
744 if (tok->enc == NULL && newl[1]) {
745 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
746 tok, buf_setreadl))
747 return error_ret(tok);
748 }
749 }
750 if (tok->enc != NULL) {
751 assert(utf8 == NULL);
752 utf8 = translate_into_utf8(str, tok->enc);
753 if (utf8 == NULL)
754 return error_ret(tok);
755 str = PyBytes_AS_STRING(utf8);
756 }
757 assert(tok->decoding_buffer == NULL);
758 tok->decoding_buffer = utf8; /* CAUTION */
759 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000760}
761
762#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763
764/* Set up tokenizer for string */
765
766struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000767PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 struct tok_state *tok = tok_new();
770 if (tok == NULL)
771 return NULL;
772 str = (char *)decode_str(str, exec_input, tok);
773 if (str == NULL) {
774 PyTokenizer_Free(tok);
775 return NULL;
776 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000777
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 /* XXX: constify members. */
779 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
780 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781}
782
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000783struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000784PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000785{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 struct tok_state *tok = tok_new();
787 if (tok == NULL)
788 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000789#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000791#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 if (str == NULL) {
793 PyTokenizer_Free(tok);
794 return NULL;
795 }
796 tok->decoding_state = STATE_RAW;
797 tok->read_coding_spec = 1;
798 tok->enc = NULL;
799 tok->str = str;
800 tok->encoding = (char *)PyMem_MALLOC(6);
801 if (!tok->encoding) {
802 PyTokenizer_Free(tok);
803 return NULL;
804 }
805 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000806
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 /* XXX: constify members. */
808 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
809 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000810}
811
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000812/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813
814struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000815PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 struct tok_state *tok = tok_new();
818 if (tok == NULL)
819 return NULL;
820 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
821 PyTokenizer_Free(tok);
822 return NULL;
823 }
824 tok->cur = tok->inp = tok->buf;
825 tok->end = tok->buf + BUFSIZ;
826 tok->fp = fp;
827 tok->prompt = ps1;
828 tok->nextprompt = ps2;
829 if (enc != NULL) {
830 /* Must copy encoding declaration since it
831 gets copied into the parse tree. */
832 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
833 if (!tok->encoding) {
834 PyTokenizer_Free(tok);
835 return NULL;
836 }
837 strcpy(tok->encoding, enc);
838 tok->decoding_state = STATE_NORMAL;
839 }
840 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000841}
842
843
844/* Free a tok_state structure */
845
846void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000847PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000848{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 if (tok->encoding != NULL)
850 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000851#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 Py_XDECREF(tok->decoding_readline);
853 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200854 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000855#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856 if (tok->fp != NULL && tok->buf != NULL)
857 PyMem_FREE(tok->buf);
858 if (tok->input)
859 PyMem_FREE((char *)tok->input);
860 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861}
862
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863/* Get next char, updating state; error code goes into tok->done */
864
865static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000866tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000867{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000868 for (;;) {
869 if (tok->cur != tok->inp) {
870 return Py_CHARMASK(*tok->cur++); /* Fast path */
871 }
872 if (tok->done != E_OK)
873 return EOF;
874 if (tok->fp == NULL) {
875 char *end = strchr(tok->inp, '\n');
876 if (end != NULL)
877 end++;
878 else {
879 end = strchr(tok->inp, '\0');
880 if (end == tok->inp) {
881 tok->done = E_EOF;
882 return EOF;
883 }
884 }
885 if (tok->start == NULL)
886 tok->buf = tok->cur;
887 tok->line_start = tok->cur;
888 tok->lineno++;
889 tok->inp = end;
890 return Py_CHARMASK(*tok->cur++);
891 }
892 if (tok->prompt != NULL) {
893 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000894#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000895 if (newtok != NULL) {
896 char *translated = translate_newlines(newtok, 0, tok);
897 PyMem_FREE(newtok);
898 if (translated == NULL)
899 return EOF;
900 newtok = translated;
901 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 if (tok->encoding && newtok && *newtok) {
903 /* Recode to UTF-8 */
904 Py_ssize_t buflen;
905 const char* buf;
906 PyObject *u = translate_into_utf8(newtok, tok->encoding);
907 PyMem_FREE(newtok);
908 if (!u) {
909 tok->done = E_DECODE;
910 return EOF;
911 }
912 buflen = PyBytes_GET_SIZE(u);
913 buf = PyBytes_AS_STRING(u);
914 if (!buf) {
915 Py_DECREF(u);
916 tok->done = E_DECODE;
917 return EOF;
918 }
919 newtok = PyMem_MALLOC(buflen+1);
920 strcpy(newtok, buf);
921 Py_DECREF(u);
922 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000923#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 if (tok->nextprompt != NULL)
925 tok->prompt = tok->nextprompt;
926 if (newtok == NULL)
927 tok->done = E_INTR;
928 else if (*newtok == '\0') {
929 PyMem_FREE(newtok);
930 tok->done = E_EOF;
931 }
932 else if (tok->start != NULL) {
933 size_t start = tok->start - tok->buf;
934 size_t oldlen = tok->cur - tok->buf;
935 size_t newlen = oldlen + strlen(newtok);
936 char *buf = tok->buf;
937 buf = (char *)PyMem_REALLOC(buf, newlen+1);
938 tok->lineno++;
939 if (buf == NULL) {
940 PyMem_FREE(tok->buf);
941 tok->buf = NULL;
942 PyMem_FREE(newtok);
943 tok->done = E_NOMEM;
944 return EOF;
945 }
946 tok->buf = buf;
947 tok->cur = tok->buf + oldlen;
948 tok->line_start = tok->cur;
949 strcpy(tok->buf + oldlen, newtok);
950 PyMem_FREE(newtok);
951 tok->inp = tok->buf + newlen;
952 tok->end = tok->inp + 1;
953 tok->start = tok->buf + start;
954 }
955 else {
956 tok->lineno++;
957 if (tok->buf != NULL)
958 PyMem_FREE(tok->buf);
959 tok->buf = newtok;
960 tok->line_start = tok->buf;
961 tok->cur = tok->buf;
962 tok->line_start = tok->buf;
963 tok->inp = strchr(tok->buf, '\0');
964 tok->end = tok->inp + 1;
965 }
966 }
967 else {
968 int done = 0;
969 Py_ssize_t cur = 0;
970 char *pt;
971 if (tok->start == NULL) {
972 if (tok->buf == NULL) {
973 tok->buf = (char *)
974 PyMem_MALLOC(BUFSIZ);
975 if (tok->buf == NULL) {
976 tok->done = E_NOMEM;
977 return EOF;
978 }
979 tok->end = tok->buf + BUFSIZ;
980 }
981 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
982 tok) == NULL) {
983 tok->done = E_EOF;
984 done = 1;
985 }
986 else {
987 tok->done = E_OK;
988 tok->inp = strchr(tok->buf, '\0');
989 done = tok->inp[-1] == '\n';
990 }
991 }
992 else {
993 cur = tok->cur - tok->buf;
994 if (decoding_feof(tok)) {
995 tok->done = E_EOF;
996 done = 1;
997 }
998 else
999 tok->done = E_OK;
1000 }
1001 tok->lineno++;
1002 /* Read until '\n' or EOF */
1003 while (!done) {
1004 Py_ssize_t curstart = tok->start == NULL ? -1 :
1005 tok->start - tok->buf;
1006 Py_ssize_t curvalid = tok->inp - tok->buf;
1007 Py_ssize_t newsize = curvalid + BUFSIZ;
1008 char *newbuf = tok->buf;
1009 newbuf = (char *)PyMem_REALLOC(newbuf,
1010 newsize);
1011 if (newbuf == NULL) {
1012 tok->done = E_NOMEM;
1013 tok->cur = tok->inp;
1014 return EOF;
1015 }
1016 tok->buf = newbuf;
1017 tok->inp = tok->buf + curvalid;
1018 tok->end = tok->buf + newsize;
1019 tok->start = curstart < 0 ? NULL :
1020 tok->buf + curstart;
1021 if (decoding_fgets(tok->inp,
1022 (int)(tok->end - tok->inp),
1023 tok) == NULL) {
1024 /* Break out early on decoding
1025 errors, as tok->buf will be NULL
1026 */
1027 if (tok->decoding_erred)
1028 return EOF;
1029 /* Last line does not end in \n,
1030 fake one */
1031 strcpy(tok->inp, "\n");
1032 }
1033 tok->inp = strchr(tok->inp, '\0');
1034 done = tok->inp[-1] == '\n';
1035 }
1036 if (tok->buf != NULL) {
1037 tok->cur = tok->buf + cur;
1038 tok->line_start = tok->cur;
1039 /* replace "\r\n" with "\n" */
1040 /* For Mac leave the \r, giving a syntax error */
1041 pt = tok->inp - 2;
1042 if (pt >= tok->buf && *pt == '\r') {
1043 *pt++ = '\n';
1044 *pt = '\0';
1045 tok->inp = pt;
1046 }
1047 }
1048 }
1049 if (tok->done != E_OK) {
1050 if (tok->prompt != NULL)
1051 PySys_WriteStderr("\n");
1052 tok->cur = tok->inp;
1053 return EOF;
1054 }
1055 }
1056 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057}
1058
1059
1060/* Back-up one character */
1061
1062static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001063tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 if (c != EOF) {
1066 if (--tok->cur < tok->buf)
1067 Py_FatalError("tok_backup: beginning of buffer");
1068 if (*tok->cur != c)
1069 *tok->cur = c;
1070 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001071}
1072
1073
1074/* Return the token corresponding to a single character */
1075
1076int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001077PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001078{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 switch (c) {
1080 case '(': return LPAR;
1081 case ')': return RPAR;
1082 case '[': return LSQB;
1083 case ']': return RSQB;
1084 case ':': return COLON;
1085 case ',': return COMMA;
1086 case ';': return SEMI;
1087 case '+': return PLUS;
1088 case '-': return MINUS;
1089 case '*': return STAR;
1090 case '/': return SLASH;
1091 case '|': return VBAR;
1092 case '&': return AMPER;
1093 case '<': return LESS;
1094 case '>': return GREATER;
1095 case '=': return EQUAL;
1096 case '.': return DOT;
1097 case '%': return PERCENT;
1098 case '{': return LBRACE;
1099 case '}': return RBRACE;
1100 case '^': return CIRCUMFLEX;
1101 case '~': return TILDE;
1102 case '@': return AT;
1103 default: return OP;
1104 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001105}
1106
1107
Guido van Rossumfbab9051991-10-20 20:25:03 +00001108int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001109PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 switch (c1) {
1112 case '=':
1113 switch (c2) {
1114 case '=': return EQEQUAL;
1115 }
1116 break;
1117 case '!':
1118 switch (c2) {
1119 case '=': return NOTEQUAL;
1120 }
1121 break;
1122 case '<':
1123 switch (c2) {
1124 case '>': return NOTEQUAL;
1125 case '=': return LESSEQUAL;
1126 case '<': return LEFTSHIFT;
1127 }
1128 break;
1129 case '>':
1130 switch (c2) {
1131 case '=': return GREATEREQUAL;
1132 case '>': return RIGHTSHIFT;
1133 }
1134 break;
1135 case '+':
1136 switch (c2) {
1137 case '=': return PLUSEQUAL;
1138 }
1139 break;
1140 case '-':
1141 switch (c2) {
1142 case '=': return MINEQUAL;
1143 case '>': return RARROW;
1144 }
1145 break;
1146 case '*':
1147 switch (c2) {
1148 case '*': return DOUBLESTAR;
1149 case '=': return STAREQUAL;
1150 }
1151 break;
1152 case '/':
1153 switch (c2) {
1154 case '/': return DOUBLESLASH;
1155 case '=': return SLASHEQUAL;
1156 }
1157 break;
1158 case '|':
1159 switch (c2) {
1160 case '=': return VBAREQUAL;
1161 }
1162 break;
1163 case '%':
1164 switch (c2) {
1165 case '=': return PERCENTEQUAL;
1166 }
1167 break;
1168 case '&':
1169 switch (c2) {
1170 case '=': return AMPEREQUAL;
1171 }
1172 break;
1173 case '^':
1174 switch (c2) {
1175 case '=': return CIRCUMFLEXEQUAL;
1176 }
1177 break;
1178 }
1179 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001180}
1181
Thomas Wouters434d0822000-08-24 20:11:32 +00001182int
1183PyToken_ThreeChars(int c1, int c2, int c3)
1184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 switch (c1) {
1186 case '<':
1187 switch (c2) {
1188 case '<':
1189 switch (c3) {
1190 case '=':
1191 return LEFTSHIFTEQUAL;
1192 }
1193 break;
1194 }
1195 break;
1196 case '>':
1197 switch (c2) {
1198 case '>':
1199 switch (c3) {
1200 case '=':
1201 return RIGHTSHIFTEQUAL;
1202 }
1203 break;
1204 }
1205 break;
1206 case '*':
1207 switch (c2) {
1208 case '*':
1209 switch (c3) {
1210 case '=':
1211 return DOUBLESTAREQUAL;
1212 }
1213 break;
1214 }
1215 break;
1216 case '/':
1217 switch (c2) {
1218 case '/':
1219 switch (c3) {
1220 case '=':
1221 return DOUBLESLASHEQUAL;
1222 }
1223 break;
1224 }
1225 break;
1226 case '.':
1227 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001228 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 switch (c3) {
1230 case '.':
1231 return ELLIPSIS;
1232 }
1233 break;
1234 }
1235 break;
1236 }
1237 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001238}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001239
Guido van Rossum926f13a1998-04-09 21:38:06 +00001240static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001241indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001242{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 if (tok->alterror) {
1244 tok->done = E_TABSPACE;
1245 tok->cur = tok->inp;
1246 return 1;
1247 }
1248 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001249#ifdef PGEN
1250 PySys_WriteStderr("inconsistent use of tabs and spaces "
1251 "in indentation\n");
1252#else
1253 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001255#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001256 tok->altwarning = 0;
1257 }
1258 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001259}
1260
Martin v. Löwis47383402007-08-15 07:32:56 +00001261#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001262#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001263#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264/* Verify that the identifier follows PEP 3131.
1265 All identifier strings are guaranteed to be "ready" unicode objects.
1266 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001267static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001268verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001269{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 PyObject *s;
1271 int result;
1272 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1275 PyErr_Clear();
1276 tok->done = E_IDENTIFIER;
1277 } else {
1278 tok->done = E_ERROR;
1279 }
1280 return 0;
1281 }
1282 result = PyUnicode_IsIdentifier(s);
1283 Py_DECREF(s);
1284 if (result == 0)
1285 tok->done = E_IDENTIFIER;
1286 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001287}
1288#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290/* Get next token, after space stripping etc. */
1291
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001292static int
1293tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 register int c;
1296 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001297
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001299 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 tok->start = NULL;
1301 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001302
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 /* Get indentation level */
1304 if (tok->atbol) {
1305 register int col = 0;
1306 register int altcol = 0;
1307 tok->atbol = 0;
1308 for (;;) {
1309 c = tok_nextc(tok);
1310 if (c == ' ')
1311 col++, altcol++;
1312 else if (c == '\t') {
1313 col = (col/tok->tabsize + 1) * tok->tabsize;
1314 altcol = (altcol/tok->alttabsize + 1)
1315 * tok->alttabsize;
1316 }
1317 else if (c == '\014') /* Control-L (formfeed) */
1318 col = altcol = 0; /* For Emacs users */
1319 else
1320 break;
1321 }
1322 tok_backup(tok, c);
1323 if (c == '#' || c == '\n') {
1324 /* Lines with only whitespace and/or comments
1325 shouldn't affect the indentation and are
1326 not passed to the parser as NEWLINE tokens,
1327 except *totally* empty lines in interactive
1328 mode, which signal the end of a command group. */
1329 if (col == 0 && c == '\n' && tok->prompt != NULL)
1330 blankline = 0; /* Let it through */
1331 else
1332 blankline = 1; /* Ignore completely */
1333 /* We can't jump back right here since we still
1334 may need to skip to the end of a comment */
1335 }
1336 if (!blankline && tok->level == 0) {
1337 if (col == tok->indstack[tok->indent]) {
1338 /* No change */
1339 if (altcol != tok->altindstack[tok->indent]) {
1340 if (indenterror(tok))
1341 return ERRORTOKEN;
1342 }
1343 }
1344 else if (col > tok->indstack[tok->indent]) {
1345 /* Indent -- always one */
1346 if (tok->indent+1 >= MAXINDENT) {
1347 tok->done = E_TOODEEP;
1348 tok->cur = tok->inp;
1349 return ERRORTOKEN;
1350 }
1351 if (altcol <= tok->altindstack[tok->indent]) {
1352 if (indenterror(tok))
1353 return ERRORTOKEN;
1354 }
1355 tok->pendin++;
1356 tok->indstack[++tok->indent] = col;
1357 tok->altindstack[tok->indent] = altcol;
1358 }
1359 else /* col < tok->indstack[tok->indent] */ {
1360 /* Dedent -- any number, must be consistent */
1361 while (tok->indent > 0 &&
1362 col < tok->indstack[tok->indent]) {
1363 tok->pendin--;
1364 tok->indent--;
1365 }
1366 if (col != tok->indstack[tok->indent]) {
1367 tok->done = E_DEDENT;
1368 tok->cur = tok->inp;
1369 return ERRORTOKEN;
1370 }
1371 if (altcol != tok->altindstack[tok->indent]) {
1372 if (indenterror(tok))
1373 return ERRORTOKEN;
1374 }
1375 }
1376 }
1377 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001380
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 /* Return pending indents/dedents */
1382 if (tok->pendin != 0) {
1383 if (tok->pendin < 0) {
1384 tok->pendin++;
1385 return DEDENT;
1386 }
1387 else {
1388 tok->pendin--;
1389 return INDENT;
1390 }
1391 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001392
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001393 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 tok->start = NULL;
1395 /* Skip spaces */
1396 do {
1397 c = tok_nextc(tok);
1398 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001399
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 /* Set start of current token */
1401 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 /* Skip comment */
1404 if (c == '#')
1405 while (c != EOF && c != '\n')
1406 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 /* Check for EOF and errors now */
1409 if (c == EOF) {
1410 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1411 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 /* Identifier (most frequent token!) */
1414 nonascii = 0;
1415 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001416 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001417 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001418 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001419 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001420 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001421 /* Since this is a backwards compatibility support literal we don't
1422 want to support it in arbitrary order like byte literals. */
1423 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1424 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001425 /* ur"" and ru"" are not supported */
1426 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001427 saw_r = 1;
1428 else
1429 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 c = tok_nextc(tok);
1431 if (c == '"' || c == '\'')
1432 goto letter_quote;
1433 }
1434 while (is_potential_identifier_char(c)) {
1435 if (c >= 128)
1436 nonascii = 1;
1437 c = tok_nextc(tok);
1438 }
1439 tok_backup(tok, c);
1440 if (nonascii &&
1441 !verify_identifier(tok)) {
1442 tok->done = E_IDENTIFIER;
1443 return ERRORTOKEN;
1444 }
1445 *p_start = tok->start;
1446 *p_end = tok->cur;
1447 return NAME;
1448 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001449
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 /* Newline */
1451 if (c == '\n') {
1452 tok->atbol = 1;
1453 if (blankline || tok->level > 0)
1454 goto nextline;
1455 *p_start = tok->start;
1456 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1457 tok->cont_line = 0;
1458 return NEWLINE;
1459 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 /* Period or number starting with period? */
1462 if (c == '.') {
1463 c = tok_nextc(tok);
1464 if (isdigit(c)) {
1465 goto fraction;
1466 } else if (c == '.') {
1467 c = tok_nextc(tok);
1468 if (c == '.') {
1469 *p_start = tok->start;
1470 *p_end = tok->cur;
1471 return ELLIPSIS;
1472 } else {
1473 tok_backup(tok, c);
1474 }
1475 tok_backup(tok, '.');
1476 } else {
1477 tok_backup(tok, c);
1478 }
1479 *p_start = tok->start;
1480 *p_end = tok->cur;
1481 return DOT;
1482 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001483
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001484 /* Number */
1485 if (isdigit(c)) {
1486 if (c == '0') {
1487 /* Hex, octal or binary -- maybe. */
1488 c = tok_nextc(tok);
1489 if (c == '.')
1490 goto fraction;
1491 if (c == 'j' || c == 'J')
1492 goto imaginary;
1493 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 /* Hex */
1496 c = tok_nextc(tok);
1497 if (!isxdigit(c)) {
1498 tok->done = E_TOKEN;
1499 tok_backup(tok, c);
1500 return ERRORTOKEN;
1501 }
1502 do {
1503 c = tok_nextc(tok);
1504 } while (isxdigit(c));
1505 }
1506 else if (c == 'o' || c == 'O') {
1507 /* Octal */
1508 c = tok_nextc(tok);
1509 if (c < '0' || c >= '8') {
1510 tok->done = E_TOKEN;
1511 tok_backup(tok, c);
1512 return ERRORTOKEN;
1513 }
1514 do {
1515 c = tok_nextc(tok);
1516 } while ('0' <= c && c < '8');
1517 }
1518 else if (c == 'b' || c == 'B') {
1519 /* Binary */
1520 c = tok_nextc(tok);
1521 if (c != '0' && c != '1') {
1522 tok->done = E_TOKEN;
1523 tok_backup(tok, c);
1524 return ERRORTOKEN;
1525 }
1526 do {
1527 c = tok_nextc(tok);
1528 } while (c == '0' || c == '1');
1529 }
1530 else {
1531 int nonzero = 0;
1532 /* maybe old-style octal; c is first char of it */
1533 /* in any case, allow '0' as a literal */
1534 while (c == '0')
1535 c = tok_nextc(tok);
1536 while (isdigit(c)) {
1537 nonzero = 1;
1538 c = tok_nextc(tok);
1539 }
1540 if (c == '.')
1541 goto fraction;
1542 else if (c == 'e' || c == 'E')
1543 goto exponent;
1544 else if (c == 'j' || c == 'J')
1545 goto imaginary;
1546 else if (nonzero) {
1547 tok->done = E_TOKEN;
1548 tok_backup(tok, c);
1549 return ERRORTOKEN;
1550 }
1551 }
1552 }
1553 else {
1554 /* Decimal */
1555 do {
1556 c = tok_nextc(tok);
1557 } while (isdigit(c));
1558 {
1559 /* Accept floating point numbers. */
1560 if (c == '.') {
1561 fraction:
1562 /* Fraction */
1563 do {
1564 c = tok_nextc(tok);
1565 } while (isdigit(c));
1566 }
1567 if (c == 'e' || c == 'E') {
1568 exponent:
1569 /* Exponent part */
1570 c = tok_nextc(tok);
1571 if (c == '+' || c == '-')
1572 c = tok_nextc(tok);
1573 if (!isdigit(c)) {
1574 tok->done = E_TOKEN;
1575 tok_backup(tok, c);
1576 return ERRORTOKEN;
1577 }
1578 do {
1579 c = tok_nextc(tok);
1580 } while (isdigit(c));
1581 }
1582 if (c == 'j' || c == 'J')
1583 /* Imaginary part */
1584 imaginary:
1585 c = tok_nextc(tok);
1586 }
1587 }
1588 tok_backup(tok, c);
1589 *p_start = tok->start;
1590 *p_end = tok->cur;
1591 return NUMBER;
1592 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001593
1594 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 /* String */
1596 if (c == '\'' || c == '"') {
1597 int quote = c;
1598 int quote_size = 1; /* 1 or 3 */
1599 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001600
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 /* Find the quote size and start of string */
1602 c = tok_nextc(tok);
1603 if (c == quote) {
1604 c = tok_nextc(tok);
1605 if (c == quote)
1606 quote_size = 3;
1607 else
1608 end_quote_size = 1; /* empty string found */
1609 }
1610 if (c != quote)
1611 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001612
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 /* Get rest of string */
1614 while (end_quote_size != quote_size) {
1615 c = tok_nextc(tok);
1616 if (c == EOF) {
1617 if (quote_size == 3)
1618 tok->done = E_EOFS;
1619 else
1620 tok->done = E_EOLS;
1621 tok->cur = tok->inp;
1622 return ERRORTOKEN;
1623 }
1624 if (quote_size == 1 && c == '\n') {
1625 tok->done = E_EOLS;
1626 tok->cur = tok->inp;
1627 return ERRORTOKEN;
1628 }
1629 if (c == quote)
1630 end_quote_size += 1;
1631 else {
1632 end_quote_size = 0;
1633 if (c == '\\')
1634 c = tok_nextc(tok); /* skip escaped char */
1635 }
1636 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001637
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 *p_start = tok->start;
1639 *p_end = tok->cur;
1640 return STRING;
1641 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001642
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 /* Line continuation */
1644 if (c == '\\') {
1645 c = tok_nextc(tok);
1646 if (c != '\n') {
1647 tok->done = E_LINECONT;
1648 tok->cur = tok->inp;
1649 return ERRORTOKEN;
1650 }
1651 tok->cont_line = 1;
1652 goto again; /* Read next line */
1653 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001654
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 /* Check for two-character token */
1656 {
1657 int c2 = tok_nextc(tok);
1658 int token = PyToken_TwoChars(c, c2);
1659 if (token != OP) {
1660 int c3 = tok_nextc(tok);
1661 int token3 = PyToken_ThreeChars(c, c2, c3);
1662 if (token3 != OP) {
1663 token = token3;
1664 } else {
1665 tok_backup(tok, c3);
1666 }
1667 *p_start = tok->start;
1668 *p_end = tok->cur;
1669 return token;
1670 }
1671 tok_backup(tok, c2);
1672 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001673
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 /* Keep track of parentheses nesting level */
1675 switch (c) {
1676 case '(':
1677 case '[':
1678 case '{':
1679 tok->level++;
1680 break;
1681 case ')':
1682 case ']':
1683 case '}':
1684 tok->level--;
1685 break;
1686 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001687
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 /* Punctuation character */
1689 *p_start = tok->start;
1690 *p_end = tok->cur;
1691 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001692}
1693
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001694int
1695PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1696{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 int result = tok_get(tok, p_start, p_end);
1698 if (tok->decoding_erred) {
1699 result = ERRORTOKEN;
1700 tok->done = E_DECODE;
1701 }
1702 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001703}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001704
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001705/* Get the encoding of a Python file. Check for the coding cookie and check if
1706 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001707
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001708 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1709 encoding in the first or second line of the file (in which case the encoding
1710 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001711
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001712 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1713 by the caller. */
1714
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001715char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001716PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001717{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 struct tok_state *tok;
1719 FILE *fp;
1720 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001721
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 fd = dup(fd);
1723 if (fd < 0) {
1724 return NULL;
1725 }
1726 fp = fdopen(fd, "r");
1727 if (fp == NULL) {
1728 return NULL;
1729 }
1730 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1731 if (tok == NULL) {
1732 fclose(fp);
1733 return NULL;
1734 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001735#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001736 if (filename != NULL) {
1737 Py_INCREF(filename);
1738 tok->filename = filename;
1739 }
1740 else {
1741 tok->filename = PyUnicode_FromString("<string>");
1742 if (tok->filename == NULL) {
1743 fclose(fp);
1744 PyTokenizer_Free(tok);
1745 return encoding;
1746 }
1747 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001748#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 while (tok->lineno < 2 && tok->done == E_OK) {
1750 PyTokenizer_Get(tok, &p_start, &p_end);
1751 }
1752 fclose(fp);
1753 if (tok->encoding) {
1754 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1755 if (encoding)
1756 strcpy(encoding, tok->encoding);
1757 }
1758 PyTokenizer_Free(tok);
1759 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001760}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001761
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001762char *
1763PyTokenizer_FindEncoding(int fd)
1764{
1765 return PyTokenizer_FindEncodingFilename(fd, NULL);
1766}
1767
#ifdef Py_DEBUG

/* Debug helper: print the name of token `type` to stdout, followed by
   the token text [start, end) in parentheses for NAME, NUMBER, STRING
   and OP tokens. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif