blob: a079c9d9f3722ba298bb3e929e2a5b454e9e564c [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128.  The argument is parenthesized so that expression arguments
   expand correctly; it is still evaluated several times, so it must not
   have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may continue an identifier: an ASCII letter or digit, '_',
   or any value >= 128.  The argument is parenthesized so that expression
   arguments expand correctly; it is still evaluated several times, so it
   must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Defined outside this file.  Returns a malloc'ed string including the
   trailing \n; an empty malloc'ed string for EOF; NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

38
/* Tab stop width.  Don't ever change this -- it would break the
   portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Forward declarations of static helpers */
static struct tok_state *tok_new(void);
static int  tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER",
    "LESS", "GREATER", "EQUAL",
    "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000176static char *
177decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700179 return new_string(str, strlen(str), tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map an encoding name to its canonical spelling for the two encodings
   treated specially: UTF-8 and Latin-1.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  Returns the constant string "utf-8" or
   "iso-8859-1" when the name (or its prefix followed by '-') matches one
   of the known aliases; otherwise returns S itself, so callers can detect
   "no change" by pointer comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a high-bit
           byte on platforms where char is signed) to tolower() is
           undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
257 if (!r)
258 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 char* q = get_normal_name(r);
260 if (r != q) {
261 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700262 r = new_string(q, strlen(q), tok);
263 if (!r)
264 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 }
268 }
269 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700270 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271}
272
273/* Check whether the line contains a coding spec. If it does,
274 invoke the set_readline function for the new encoding.
275 This function receives the tok_state and the new encoding.
276 Return 1 on success, 0 on failure. */
277
278static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000279check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700282 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000283 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000284
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 if (tok->cont_line)
286 /* It's a continuation line, so it can't be a coding spec. */
287 return 1;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700288 if (!get_coding_spec(line, &cs, size, tok))
289 return 0;
290 if (!cs)
291 return 1;
292 tok->read_coding_spec = 1;
293 if (tok->encoding == NULL) {
294 assert(tok->decoding_state == STATE_RAW);
295 if (strcmp(cs, "utf-8") == 0) {
296 tok->encoding = cs;
297 } else {
298 r = set_readline(tok, cs);
299 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000300 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700301 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000302 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700303 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300304 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 "encoding problem: %s", cs);
306 PyMem_FREE(cs);
307 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700309 } else { /* then, compare cs with BOM */
310 r = (strcmp(tok->encoding, cs) == 0);
311 if (!r)
312 PyErr_Format(PyExc_SyntaxError,
313 "encoding problem: %s with BOM", cs);
314 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000317}
318
319/* See whether the file starts with a BOM. If it does,
320 invoke the set_readline function with the new encoding.
321 Return 1 on success, 0 on failure. */
322
323static int
324check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325 void unget_char(int, struct tok_state *),
326 int set_readline(struct tok_state *, const char *),
327 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 int ch1, ch2, ch3;
330 ch1 = get_char(tok);
331 tok->decoding_state = STATE_RAW;
332 if (ch1 == EOF) {
333 return 1;
334 } else if (ch1 == 0xEF) {
335 ch2 = get_char(tok);
336 if (ch2 != 0xBB) {
337 unget_char(ch2, tok);
338 unget_char(ch1, tok);
339 return 1;
340 }
341 ch3 = get_char(tok);
342 if (ch3 != 0xBF) {
343 unget_char(ch3, tok);
344 unget_char(ch2, tok);
345 unget_char(ch1, tok);
346 return 1;
347 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000349 /* Disable support for UTF-16 BOMs until a decision
350 is made whether this needs to be supported. */
351 } else if (ch1 == 0xFE) {
352 ch2 = get_char(tok);
353 if (ch2 != 0xFF) {
354 unget_char(ch2, tok);
355 unget_char(ch1, tok);
356 return 1;
357 }
358 if (!set_readline(tok, "utf-16-be"))
359 return 0;
360 tok->decoding_state = STATE_NORMAL;
361 } else if (ch1 == 0xFF) {
362 ch2 = get_char(tok);
363 if (ch2 != 0xFE) {
364 unget_char(ch2, tok);
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (!set_readline(tok, "utf-16-le"))
369 return 0;
370 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 } else {
373 unget_char(ch1, tok);
374 return 1;
375 }
376 if (tok->encoding != NULL)
377 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700378 tok->encoding = new_string("utf-8", 5, tok);
379 if (!tok->encoding)
380 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 /* No need to set_readline: input is already utf-8 */
382 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000383}
384
385/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000386 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 On entry, tok->decoding_buffer will be one of:
389 1) NULL: need to call tok->decoding_readline to get a new line
390 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000391 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000392 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 (in the s buffer) to copy entire contents of the line read
394 by tok->decoding_readline. tok->decoding_buffer has the overflow.
395 In this case, fp_readl is called in a loop (with an expanded buffer)
396 until the buffer ends with a '\n' (or until the end of the file is
397 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000398*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399
400static char *
401fp_readl(char *s, int size, struct tok_state *tok)
402{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 PyObject* bufobj;
404 const char *buf;
405 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000406
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 /* Ask for one less byte so we can terminate it */
408 assert(size > 0);
409 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000411 if (tok->decoding_buffer) {
412 bufobj = tok->decoding_buffer;
413 Py_INCREF(bufobj);
414 }
415 else
416 {
417 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
418 if (bufobj == NULL)
419 goto error;
420 }
421 if (PyUnicode_CheckExact(bufobj))
422 {
423 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
424 if (buf == NULL) {
425 goto error;
426 }
427 }
428 else
429 {
430 buf = PyByteArray_AsString(bufobj);
431 if (buf == NULL) {
432 goto error;
433 }
434 buflen = PyByteArray_GET_SIZE(bufobj);
435 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000437 Py_XDECREF(tok->decoding_buffer);
438 if (buflen > size) {
439 /* Too many chars, the rest goes into tok->decoding_buffer */
440 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
441 buflen-size);
442 if (tok->decoding_buffer == NULL)
443 goto error;
444 buflen = size;
445 }
446 else
447 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000449 memcpy(s, buf, buflen);
450 s[buflen] = '\0';
451 if (buflen == 0) /* EOF */
452 s = NULL;
453 Py_DECREF(bufobj);
454 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000455
456error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000457 Py_XDECREF(bufobj);
458 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459}
460
461/* Set the readline function for TOK to a StreamReader's
462 readline function. The StreamReader is named ENC.
463
464 This function is called from check_bom and check_coding_spec.
465
466 ENC is usually identical to the future value of tok->encoding,
467 except for the (currently unsupported) case of UTF-16.
468
469 Return 1 on success, 0 on failure. */
470
471static int
472fp_setreadl(struct tok_state *tok, const char* enc)
473{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200475 _Py_IDENTIFIER(open);
476 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000477 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 io = PyImport_ImportModuleNoBlock("io");
480 if (io == NULL)
481 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000482
Victor Stinner22a351a2010-10-14 12:04:34 +0000483 fd = fileno(tok->fp);
484 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
485 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
486 goto cleanup;
487 }
488
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200489 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000490 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000491 if (stream == NULL)
492 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000494 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200495 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000497
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 /* The file has been reopened; parsing will restart from
499 * the beginning of the file, we have to reset the line number.
500 * But this function has been called from inside tok_nextc() which
501 * will increment lineno before it returns. So we set it -1 so that
502 * the next call to tok_nextc() will start with tok->lineno == 0.
503 */
504 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000505
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000506 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507 Py_XDECREF(stream);
508 Py_XDECREF(io);
509 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510}
511
512/* Fetch the next byte from TOK. */
513
514static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000515 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516}
517
518/* Unfetch the last byte back into TOK. */
519
520static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000521 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000522}
523
/* Check whether the bytes at S start a structurally valid UTF-8 sequence
   (a lead byte followed by the right number of continuation bytes).
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back and checking stops at the first byte that is not a continuation
   byte; a NUL terminator is such a byte, so a sequence truncated by the
   end of the string never causes a read past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
551
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552/* Read a line of input from TOK. Determine encoding
553 if necessary. */
554
555static char *
556decoding_fgets(char *s, int size, struct tok_state *tok)
557{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000558 char *line = NULL;
559 int badchar = 0;
560 for (;;) {
561 if (tok->decoding_state == STATE_NORMAL) {
562 /* We already have a codec associated with
563 this input. */
564 line = fp_readl(s, size, tok);
565 break;
566 } else if (tok->decoding_state == STATE_RAW) {
567 /* We want a 'raw' read. */
568 line = Py_UniversalNewlineFgets(s, size,
569 tok->fp, NULL);
570 break;
571 } else {
572 /* We have not yet determined the encoding.
573 If an encoding is found, use the file-pointer
574 reader functions from now on. */
575 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
576 return error_ret(tok);
577 assert(tok->decoding_state != STATE_INIT);
578 }
579 }
580 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
581 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
582 return error_ret(tok);
583 }
584 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 /* The default encoding is UTF-8, so make sure we don't have any
587 non-UTF-8 sequences in it. */
588 if (line && !tok->encoding) {
589 unsigned char *c;
590 int length;
591 for (c = (unsigned char *)line; *c; c += length)
592 if (!(length = valid_utf8(c))) {
593 badchar = *c;
594 break;
595 }
596 }
597 if (badchar) {
598 /* Need to add 1 to the line number, since this line
599 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200600 PyErr_Format(PyExc_SyntaxError,
601 "Non-UTF-8 code starting with '\\x%.2x' "
602 "in file %U on line %i, "
603 "but no encoding declared; "
604 "see http://python.org/dev/peps/pep-0263/ for details",
605 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000606 return error_ret(tok);
607 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000609 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000610}
611
612static int
613decoding_feof(struct tok_state *tok)
614{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 if (tok->decoding_state != STATE_NORMAL) {
616 return feof(tok->fp);
617 } else {
618 PyObject* buf = tok->decoding_buffer;
619 if (buf == NULL) {
620 buf = PyObject_CallObject(tok->decoding_readline, NULL);
621 if (buf == NULL) {
622 error_ret(tok);
623 return 1;
624 } else {
625 tok->decoding_buffer = buf;
626 }
627 }
628 return PyObject_Length(buf) == 0;
629 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630}
631
632/* Fetch a byte from TOK, using the string buffer. */
633
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000634static int
635buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637}
638
639/* Unfetch a byte from TOK, using the string buffer. */
640
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000641static void
642buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 tok->str--;
644 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645}
646
647/* Set the readline function for TOK to ENC. For the string-based
648 tokenizer, this means to just record the encoding. */
649
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000650static int
651buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000652 tok->enc = enc;
653 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000654}
655
656/* Return a UTF-8 encoding Python string object from the
657 C byte string STR, which is encoded with ENC. */
658
659static PyObject *
660translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 PyObject *utf8;
662 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
663 if (buf == NULL)
664 return NULL;
665 utf8 = PyUnicode_AsUTF8String(buf);
666 Py_DECREF(buf);
667 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668}
669
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000670
671static char *
672translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
674 char *buf, *current;
675 char c = '\0';
676 buf = PyMem_MALLOC(needed_length);
677 if (buf == NULL) {
678 tok->done = E_NOMEM;
679 return NULL;
680 }
681 for (current = buf; *s; s++, current++) {
682 c = *s;
683 if (skip_next_lf) {
684 skip_next_lf = 0;
685 if (c == '\n') {
686 c = *++s;
687 if (!c)
688 break;
689 }
690 }
691 if (c == '\r') {
692 skip_next_lf = 1;
693 c = '\n';
694 }
695 *current = c;
696 }
697 /* If this is exec input, add a newline to the end of the string if
698 there isn't one already. */
699 if (exec_input && c != '\n') {
700 *current = '\n';
701 current++;
702 }
703 *current = '\0';
704 final_length = current - buf + 1;
705 if (final_length < needed_length && final_length)
706 /* should never fail */
707 buf = PyMem_REALLOC(buf, final_length);
708 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000709}
710
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000711/* Decode a byte string STR for use as the buffer of TOK.
712 Look for encoding declarations inside STR, and record them
713 inside TOK. */
714
715static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000716decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000717{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 PyObject* utf8 = NULL;
719 const char *str;
720 const char *s;
721 const char *newl[2] = {NULL, NULL};
722 int lineno = 0;
723 tok->input = str = translate_newlines(input, single, tok);
724 if (str == NULL)
725 return NULL;
726 tok->enc = NULL;
727 tok->str = str;
728 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
729 return error_ret(tok);
730 str = tok->str; /* string after BOM if any */
731 assert(str);
732 if (tok->enc != NULL) {
733 utf8 = translate_into_utf8(str, tok->enc);
734 if (utf8 == NULL)
735 return error_ret(tok);
736 str = PyBytes_AsString(utf8);
737 }
738 for (s = str;; s++) {
739 if (*s == '\0') break;
740 else if (*s == '\n') {
741 assert(lineno < 2);
742 newl[lineno] = s;
743 lineno++;
744 if (lineno == 2) break;
745 }
746 }
747 tok->enc = NULL;
748 /* need to check line 1 and 2 separately since check_coding_spec
749 assumes a single line as input */
750 if (newl[0]) {
751 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
752 return error_ret(tok);
753 if (tok->enc == NULL && newl[1]) {
754 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
755 tok, buf_setreadl))
756 return error_ret(tok);
757 }
758 }
759 if (tok->enc != NULL) {
760 assert(utf8 == NULL);
761 utf8 = translate_into_utf8(str, tok->enc);
762 if (utf8 == NULL)
763 return error_ret(tok);
764 str = PyBytes_AS_STRING(utf8);
765 }
766 assert(tok->decoding_buffer == NULL);
767 tok->decoding_buffer = utf8; /* CAUTION */
768 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000769}
770
771#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772
773/* Set up tokenizer for string */
774
775struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000776PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 struct tok_state *tok = tok_new();
779 if (tok == NULL)
780 return NULL;
781 str = (char *)decode_str(str, exec_input, tok);
782 if (str == NULL) {
783 PyTokenizer_Free(tok);
784 return NULL;
785 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000786
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 /* XXX: constify members. */
788 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
789 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000790}
791
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000792struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000793PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000794{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 struct tok_state *tok = tok_new();
796 if (tok == NULL)
797 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000798#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000800#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 if (str == NULL) {
802 PyTokenizer_Free(tok);
803 return NULL;
804 }
805 tok->decoding_state = STATE_RAW;
806 tok->read_coding_spec = 1;
807 tok->enc = NULL;
808 tok->str = str;
809 tok->encoding = (char *)PyMem_MALLOC(6);
810 if (!tok->encoding) {
811 PyTokenizer_Free(tok);
812 return NULL;
813 }
814 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000815
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 /* XXX: constify members. */
817 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
818 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000819}
820
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000821/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822
823struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000824PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000825{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 struct tok_state *tok = tok_new();
827 if (tok == NULL)
828 return NULL;
829 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
830 PyTokenizer_Free(tok);
831 return NULL;
832 }
833 tok->cur = tok->inp = tok->buf;
834 tok->end = tok->buf + BUFSIZ;
835 tok->fp = fp;
836 tok->prompt = ps1;
837 tok->nextprompt = ps2;
838 if (enc != NULL) {
839 /* Must copy encoding declaration since it
840 gets copied into the parse tree. */
841 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
842 if (!tok->encoding) {
843 PyTokenizer_Free(tok);
844 return NULL;
845 }
846 strcpy(tok->encoding, enc);
847 tok->decoding_state = STATE_NORMAL;
848 }
849 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000850}
851
852
853/* Free a tok_state structure */
854
855void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000856PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000858 if (tok->encoding != NULL)
859 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000860#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 Py_XDECREF(tok->decoding_readline);
862 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200863 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000864#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865 if (tok->fp != NULL && tok->buf != NULL)
866 PyMem_FREE(tok->buf);
867 if (tok->input)
868 PyMem_FREE((char *)tok->input);
869 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000870}
871
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000872/* Get next char, updating state; error code goes into tok->done */
873
874static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000875tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000876{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000877 for (;;) {
878 if (tok->cur != tok->inp) {
879 return Py_CHARMASK(*tok->cur++); /* Fast path */
880 }
881 if (tok->done != E_OK)
882 return EOF;
883 if (tok->fp == NULL) {
884 char *end = strchr(tok->inp, '\n');
885 if (end != NULL)
886 end++;
887 else {
888 end = strchr(tok->inp, '\0');
889 if (end == tok->inp) {
890 tok->done = E_EOF;
891 return EOF;
892 }
893 }
894 if (tok->start == NULL)
895 tok->buf = tok->cur;
896 tok->line_start = tok->cur;
897 tok->lineno++;
898 tok->inp = end;
899 return Py_CHARMASK(*tok->cur++);
900 }
901 if (tok->prompt != NULL) {
902 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000903#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000904 if (newtok != NULL) {
905 char *translated = translate_newlines(newtok, 0, tok);
906 PyMem_FREE(newtok);
907 if (translated == NULL)
908 return EOF;
909 newtok = translated;
910 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 if (tok->encoding && newtok && *newtok) {
912 /* Recode to UTF-8 */
913 Py_ssize_t buflen;
914 const char* buf;
915 PyObject *u = translate_into_utf8(newtok, tok->encoding);
916 PyMem_FREE(newtok);
917 if (!u) {
918 tok->done = E_DECODE;
919 return EOF;
920 }
921 buflen = PyBytes_GET_SIZE(u);
922 buf = PyBytes_AS_STRING(u);
923 if (!buf) {
924 Py_DECREF(u);
925 tok->done = E_DECODE;
926 return EOF;
927 }
928 newtok = PyMem_MALLOC(buflen+1);
929 strcpy(newtok, buf);
930 Py_DECREF(u);
931 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000932#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000933 if (tok->nextprompt != NULL)
934 tok->prompt = tok->nextprompt;
935 if (newtok == NULL)
936 tok->done = E_INTR;
937 else if (*newtok == '\0') {
938 PyMem_FREE(newtok);
939 tok->done = E_EOF;
940 }
941 else if (tok->start != NULL) {
942 size_t start = tok->start - tok->buf;
943 size_t oldlen = tok->cur - tok->buf;
944 size_t newlen = oldlen + strlen(newtok);
945 char *buf = tok->buf;
946 buf = (char *)PyMem_REALLOC(buf, newlen+1);
947 tok->lineno++;
948 if (buf == NULL) {
949 PyMem_FREE(tok->buf);
950 tok->buf = NULL;
951 PyMem_FREE(newtok);
952 tok->done = E_NOMEM;
953 return EOF;
954 }
955 tok->buf = buf;
956 tok->cur = tok->buf + oldlen;
957 tok->line_start = tok->cur;
958 strcpy(tok->buf + oldlen, newtok);
959 PyMem_FREE(newtok);
960 tok->inp = tok->buf + newlen;
961 tok->end = tok->inp + 1;
962 tok->start = tok->buf + start;
963 }
964 else {
965 tok->lineno++;
966 if (tok->buf != NULL)
967 PyMem_FREE(tok->buf);
968 tok->buf = newtok;
969 tok->line_start = tok->buf;
970 tok->cur = tok->buf;
971 tok->line_start = tok->buf;
972 tok->inp = strchr(tok->buf, '\0');
973 tok->end = tok->inp + 1;
974 }
975 }
976 else {
977 int done = 0;
978 Py_ssize_t cur = 0;
979 char *pt;
980 if (tok->start == NULL) {
981 if (tok->buf == NULL) {
982 tok->buf = (char *)
983 PyMem_MALLOC(BUFSIZ);
984 if (tok->buf == NULL) {
985 tok->done = E_NOMEM;
986 return EOF;
987 }
988 tok->end = tok->buf + BUFSIZ;
989 }
990 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
991 tok) == NULL) {
992 tok->done = E_EOF;
993 done = 1;
994 }
995 else {
996 tok->done = E_OK;
997 tok->inp = strchr(tok->buf, '\0');
998 done = tok->inp[-1] == '\n';
999 }
1000 }
1001 else {
1002 cur = tok->cur - tok->buf;
1003 if (decoding_feof(tok)) {
1004 tok->done = E_EOF;
1005 done = 1;
1006 }
1007 else
1008 tok->done = E_OK;
1009 }
1010 tok->lineno++;
1011 /* Read until '\n' or EOF */
1012 while (!done) {
1013 Py_ssize_t curstart = tok->start == NULL ? -1 :
1014 tok->start - tok->buf;
1015 Py_ssize_t curvalid = tok->inp - tok->buf;
1016 Py_ssize_t newsize = curvalid + BUFSIZ;
1017 char *newbuf = tok->buf;
1018 newbuf = (char *)PyMem_REALLOC(newbuf,
1019 newsize);
1020 if (newbuf == NULL) {
1021 tok->done = E_NOMEM;
1022 tok->cur = tok->inp;
1023 return EOF;
1024 }
1025 tok->buf = newbuf;
1026 tok->inp = tok->buf + curvalid;
1027 tok->end = tok->buf + newsize;
1028 tok->start = curstart < 0 ? NULL :
1029 tok->buf + curstart;
1030 if (decoding_fgets(tok->inp,
1031 (int)(tok->end - tok->inp),
1032 tok) == NULL) {
1033 /* Break out early on decoding
1034 errors, as tok->buf will be NULL
1035 */
1036 if (tok->decoding_erred)
1037 return EOF;
1038 /* Last line does not end in \n,
1039 fake one */
1040 strcpy(tok->inp, "\n");
1041 }
1042 tok->inp = strchr(tok->inp, '\0');
1043 done = tok->inp[-1] == '\n';
1044 }
1045 if (tok->buf != NULL) {
1046 tok->cur = tok->buf + cur;
1047 tok->line_start = tok->cur;
1048 /* replace "\r\n" with "\n" */
1049 /* For Mac leave the \r, giving a syntax error */
1050 pt = tok->inp - 2;
1051 if (pt >= tok->buf && *pt == '\r') {
1052 *pt++ = '\n';
1053 *pt = '\0';
1054 tok->inp = pt;
1055 }
1056 }
1057 }
1058 if (tok->done != E_OK) {
1059 if (tok->prompt != NULL)
1060 PySys_WriteStderr("\n");
1061 tok->cur = tok->inp;
1062 return EOF;
1063 }
1064 }
1065 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001066}
1067
1068
1069/* Back-up one character */
1070
1071static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001072tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001074 if (c != EOF) {
1075 if (--tok->cur < tok->buf)
1076 Py_FatalError("tok_backup: beginning of buffer");
1077 if (*tok->cur != c)
1078 *tok->cur = c;
1079 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001080}
1081
1082
1083/* Return the token corresponding to a single character */
1084
1085int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001086PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 switch (c) {
1089 case '(': return LPAR;
1090 case ')': return RPAR;
1091 case '[': return LSQB;
1092 case ']': return RSQB;
1093 case ':': return COLON;
1094 case ',': return COMMA;
1095 case ';': return SEMI;
1096 case '+': return PLUS;
1097 case '-': return MINUS;
1098 case '*': return STAR;
1099 case '/': return SLASH;
1100 case '|': return VBAR;
1101 case '&': return AMPER;
1102 case '<': return LESS;
1103 case '>': return GREATER;
1104 case '=': return EQUAL;
1105 case '.': return DOT;
1106 case '%': return PERCENT;
1107 case '{': return LBRACE;
1108 case '}': return RBRACE;
1109 case '^': return CIRCUMFLEX;
1110 case '~': return TILDE;
1111 case '@': return AT;
1112 default: return OP;
1113 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114}
1115
1116
Guido van Rossumfbab9051991-10-20 20:25:03 +00001117int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001118PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001119{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 switch (c1) {
1121 case '=':
1122 switch (c2) {
1123 case '=': return EQEQUAL;
1124 }
1125 break;
1126 case '!':
1127 switch (c2) {
1128 case '=': return NOTEQUAL;
1129 }
1130 break;
1131 case '<':
1132 switch (c2) {
1133 case '>': return NOTEQUAL;
1134 case '=': return LESSEQUAL;
1135 case '<': return LEFTSHIFT;
1136 }
1137 break;
1138 case '>':
1139 switch (c2) {
1140 case '=': return GREATEREQUAL;
1141 case '>': return RIGHTSHIFT;
1142 }
1143 break;
1144 case '+':
1145 switch (c2) {
1146 case '=': return PLUSEQUAL;
1147 }
1148 break;
1149 case '-':
1150 switch (c2) {
1151 case '=': return MINEQUAL;
1152 case '>': return RARROW;
1153 }
1154 break;
1155 case '*':
1156 switch (c2) {
1157 case '*': return DOUBLESTAR;
1158 case '=': return STAREQUAL;
1159 }
1160 break;
1161 case '/':
1162 switch (c2) {
1163 case '/': return DOUBLESLASH;
1164 case '=': return SLASHEQUAL;
1165 }
1166 break;
1167 case '|':
1168 switch (c2) {
1169 case '=': return VBAREQUAL;
1170 }
1171 break;
1172 case '%':
1173 switch (c2) {
1174 case '=': return PERCENTEQUAL;
1175 }
1176 break;
1177 case '&':
1178 switch (c2) {
1179 case '=': return AMPEREQUAL;
1180 }
1181 break;
1182 case '^':
1183 switch (c2) {
1184 case '=': return CIRCUMFLEXEQUAL;
1185 }
1186 break;
1187 }
1188 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001189}
1190
Thomas Wouters434d0822000-08-24 20:11:32 +00001191int
1192PyToken_ThreeChars(int c1, int c2, int c3)
1193{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001194 switch (c1) {
1195 case '<':
1196 switch (c2) {
1197 case '<':
1198 switch (c3) {
1199 case '=':
1200 return LEFTSHIFTEQUAL;
1201 }
1202 break;
1203 }
1204 break;
1205 case '>':
1206 switch (c2) {
1207 case '>':
1208 switch (c3) {
1209 case '=':
1210 return RIGHTSHIFTEQUAL;
1211 }
1212 break;
1213 }
1214 break;
1215 case '*':
1216 switch (c2) {
1217 case '*':
1218 switch (c3) {
1219 case '=':
1220 return DOUBLESTAREQUAL;
1221 }
1222 break;
1223 }
1224 break;
1225 case '/':
1226 switch (c2) {
1227 case '/':
1228 switch (c3) {
1229 case '=':
1230 return DOUBLESLASHEQUAL;
1231 }
1232 break;
1233 }
1234 break;
1235 case '.':
1236 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001237 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 switch (c3) {
1239 case '.':
1240 return ELLIPSIS;
1241 }
1242 break;
1243 }
1244 break;
1245 }
1246 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001247}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001248
Guido van Rossum926f13a1998-04-09 21:38:06 +00001249static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001250indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001251{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001252 if (tok->alterror) {
1253 tok->done = E_TABSPACE;
1254 tok->cur = tok->inp;
1255 return 1;
1256 }
1257 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001258#ifdef PGEN
1259 PySys_WriteStderr("inconsistent use of tabs and spaces "
1260 "in indentation\n");
1261#else
1262 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001264#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 tok->altwarning = 0;
1266 }
1267 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001268}
1269
Martin v. Löwis47383402007-08-15 07:32:56 +00001270#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001271#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001272#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273/* Verify that the identifier follows PEP 3131.
1274 All identifier strings are guaranteed to be "ready" unicode objects.
1275 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001276static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001277verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001278{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyObject *s;
1280 int result;
1281 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1284 PyErr_Clear();
1285 tok->done = E_IDENTIFIER;
1286 } else {
1287 tok->done = E_ERROR;
1288 }
1289 return 0;
1290 }
1291 result = PyUnicode_IsIdentifier(s);
1292 Py_DECREF(s);
1293 if (result == 0)
1294 tok->done = E_IDENTIFIER;
1295 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001296}
1297#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001298
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299/* Get next token, after space stripping etc. */
1300
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001301static int
1302tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001303{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 register int c;
1305 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001306
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001308 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 tok->start = NULL;
1310 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001311
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 /* Get indentation level */
1313 if (tok->atbol) {
1314 register int col = 0;
1315 register int altcol = 0;
1316 tok->atbol = 0;
1317 for (;;) {
1318 c = tok_nextc(tok);
1319 if (c == ' ')
1320 col++, altcol++;
1321 else if (c == '\t') {
1322 col = (col/tok->tabsize + 1) * tok->tabsize;
1323 altcol = (altcol/tok->alttabsize + 1)
1324 * tok->alttabsize;
1325 }
1326 else if (c == '\014') /* Control-L (formfeed) */
1327 col = altcol = 0; /* For Emacs users */
1328 else
1329 break;
1330 }
1331 tok_backup(tok, c);
1332 if (c == '#' || c == '\n') {
1333 /* Lines with only whitespace and/or comments
1334 shouldn't affect the indentation and are
1335 not passed to the parser as NEWLINE tokens,
1336 except *totally* empty lines in interactive
1337 mode, which signal the end of a command group. */
1338 if (col == 0 && c == '\n' && tok->prompt != NULL)
1339 blankline = 0; /* Let it through */
1340 else
1341 blankline = 1; /* Ignore completely */
1342 /* We can't jump back right here since we still
1343 may need to skip to the end of a comment */
1344 }
1345 if (!blankline && tok->level == 0) {
1346 if (col == tok->indstack[tok->indent]) {
1347 /* No change */
1348 if (altcol != tok->altindstack[tok->indent]) {
1349 if (indenterror(tok))
1350 return ERRORTOKEN;
1351 }
1352 }
1353 else if (col > tok->indstack[tok->indent]) {
1354 /* Indent -- always one */
1355 if (tok->indent+1 >= MAXINDENT) {
1356 tok->done = E_TOODEEP;
1357 tok->cur = tok->inp;
1358 return ERRORTOKEN;
1359 }
1360 if (altcol <= tok->altindstack[tok->indent]) {
1361 if (indenterror(tok))
1362 return ERRORTOKEN;
1363 }
1364 tok->pendin++;
1365 tok->indstack[++tok->indent] = col;
1366 tok->altindstack[tok->indent] = altcol;
1367 }
1368 else /* col < tok->indstack[tok->indent] */ {
1369 /* Dedent -- any number, must be consistent */
1370 while (tok->indent > 0 &&
1371 col < tok->indstack[tok->indent]) {
1372 tok->pendin--;
1373 tok->indent--;
1374 }
1375 if (col != tok->indstack[tok->indent]) {
1376 tok->done = E_DEDENT;
1377 tok->cur = tok->inp;
1378 return ERRORTOKEN;
1379 }
1380 if (altcol != tok->altindstack[tok->indent]) {
1381 if (indenterror(tok))
1382 return ERRORTOKEN;
1383 }
1384 }
1385 }
1386 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001387
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 /* Return pending indents/dedents */
1391 if (tok->pendin != 0) {
1392 if (tok->pendin < 0) {
1393 tok->pendin++;
1394 return DEDENT;
1395 }
1396 else {
1397 tok->pendin--;
1398 return INDENT;
1399 }
1400 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 tok->start = NULL;
1404 /* Skip spaces */
1405 do {
1406 c = tok_nextc(tok);
1407 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001408
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 /* Set start of current token */
1410 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 /* Skip comment */
1413 if (c == '#')
1414 while (c != EOF && c != '\n')
1415 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001416
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 /* Check for EOF and errors now */
1418 if (c == EOF) {
1419 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1420 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 /* Identifier (most frequent token!) */
1423 nonascii = 0;
1424 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001425 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001426 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001427 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001428 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001429 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001430 /* Since this is a backwards compatibility support literal we don't
1431 want to support it in arbitrary order like byte literals. */
1432 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1433 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001434 /* ur"" and ru"" are not supported */
1435 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001436 saw_r = 1;
1437 else
1438 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 c = tok_nextc(tok);
1440 if (c == '"' || c == '\'')
1441 goto letter_quote;
1442 }
1443 while (is_potential_identifier_char(c)) {
1444 if (c >= 128)
1445 nonascii = 1;
1446 c = tok_nextc(tok);
1447 }
1448 tok_backup(tok, c);
1449 if (nonascii &&
1450 !verify_identifier(tok)) {
1451 tok->done = E_IDENTIFIER;
1452 return ERRORTOKEN;
1453 }
1454 *p_start = tok->start;
1455 *p_end = tok->cur;
1456 return NAME;
1457 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001458
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001459 /* Newline */
1460 if (c == '\n') {
1461 tok->atbol = 1;
1462 if (blankline || tok->level > 0)
1463 goto nextline;
1464 *p_start = tok->start;
1465 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1466 tok->cont_line = 0;
1467 return NEWLINE;
1468 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001469
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001470 /* Period or number starting with period? */
1471 if (c == '.') {
1472 c = tok_nextc(tok);
1473 if (isdigit(c)) {
1474 goto fraction;
1475 } else if (c == '.') {
1476 c = tok_nextc(tok);
1477 if (c == '.') {
1478 *p_start = tok->start;
1479 *p_end = tok->cur;
1480 return ELLIPSIS;
1481 } else {
1482 tok_backup(tok, c);
1483 }
1484 tok_backup(tok, '.');
1485 } else {
1486 tok_backup(tok, c);
1487 }
1488 *p_start = tok->start;
1489 *p_end = tok->cur;
1490 return DOT;
1491 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Number */
1494 if (isdigit(c)) {
1495 if (c == '0') {
1496 /* Hex, octal or binary -- maybe. */
1497 c = tok_nextc(tok);
1498 if (c == '.')
1499 goto fraction;
1500 if (c == 'j' || c == 'J')
1501 goto imaginary;
1502 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001503
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 /* Hex */
1505 c = tok_nextc(tok);
1506 if (!isxdigit(c)) {
1507 tok->done = E_TOKEN;
1508 tok_backup(tok, c);
1509 return ERRORTOKEN;
1510 }
1511 do {
1512 c = tok_nextc(tok);
1513 } while (isxdigit(c));
1514 }
1515 else if (c == 'o' || c == 'O') {
1516 /* Octal */
1517 c = tok_nextc(tok);
1518 if (c < '0' || c >= '8') {
1519 tok->done = E_TOKEN;
1520 tok_backup(tok, c);
1521 return ERRORTOKEN;
1522 }
1523 do {
1524 c = tok_nextc(tok);
1525 } while ('0' <= c && c < '8');
1526 }
1527 else if (c == 'b' || c == 'B') {
1528 /* Binary */
1529 c = tok_nextc(tok);
1530 if (c != '0' && c != '1') {
1531 tok->done = E_TOKEN;
1532 tok_backup(tok, c);
1533 return ERRORTOKEN;
1534 }
1535 do {
1536 c = tok_nextc(tok);
1537 } while (c == '0' || c == '1');
1538 }
1539 else {
1540 int nonzero = 0;
1541 /* maybe old-style octal; c is first char of it */
1542 /* in any case, allow '0' as a literal */
1543 while (c == '0')
1544 c = tok_nextc(tok);
1545 while (isdigit(c)) {
1546 nonzero = 1;
1547 c = tok_nextc(tok);
1548 }
1549 if (c == '.')
1550 goto fraction;
1551 else if (c == 'e' || c == 'E')
1552 goto exponent;
1553 else if (c == 'j' || c == 'J')
1554 goto imaginary;
1555 else if (nonzero) {
1556 tok->done = E_TOKEN;
1557 tok_backup(tok, c);
1558 return ERRORTOKEN;
1559 }
1560 }
1561 }
1562 else {
1563 /* Decimal */
1564 do {
1565 c = tok_nextc(tok);
1566 } while (isdigit(c));
1567 {
1568 /* Accept floating point numbers. */
1569 if (c == '.') {
1570 fraction:
1571 /* Fraction */
1572 do {
1573 c = tok_nextc(tok);
1574 } while (isdigit(c));
1575 }
1576 if (c == 'e' || c == 'E') {
1577 exponent:
1578 /* Exponent part */
1579 c = tok_nextc(tok);
1580 if (c == '+' || c == '-')
1581 c = tok_nextc(tok);
1582 if (!isdigit(c)) {
1583 tok->done = E_TOKEN;
1584 tok_backup(tok, c);
1585 return ERRORTOKEN;
1586 }
1587 do {
1588 c = tok_nextc(tok);
1589 } while (isdigit(c));
1590 }
1591 if (c == 'j' || c == 'J')
1592 /* Imaginary part */
1593 imaginary:
1594 c = tok_nextc(tok);
1595 }
1596 }
1597 tok_backup(tok, c);
1598 *p_start = tok->start;
1599 *p_end = tok->cur;
1600 return NUMBER;
1601 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001602
1603 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 /* String */
1605 if (c == '\'' || c == '"') {
1606 int quote = c;
1607 int quote_size = 1; /* 1 or 3 */
1608 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001609
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 /* Find the quote size and start of string */
1611 c = tok_nextc(tok);
1612 if (c == quote) {
1613 c = tok_nextc(tok);
1614 if (c == quote)
1615 quote_size = 3;
1616 else
1617 end_quote_size = 1; /* empty string found */
1618 }
1619 if (c != quote)
1620 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001621
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 /* Get rest of string */
1623 while (end_quote_size != quote_size) {
1624 c = tok_nextc(tok);
1625 if (c == EOF) {
1626 if (quote_size == 3)
1627 tok->done = E_EOFS;
1628 else
1629 tok->done = E_EOLS;
1630 tok->cur = tok->inp;
1631 return ERRORTOKEN;
1632 }
1633 if (quote_size == 1 && c == '\n') {
1634 tok->done = E_EOLS;
1635 tok->cur = tok->inp;
1636 return ERRORTOKEN;
1637 }
1638 if (c == quote)
1639 end_quote_size += 1;
1640 else {
1641 end_quote_size = 0;
1642 if (c == '\\')
1643 c = tok_nextc(tok); /* skip escaped char */
1644 }
1645 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001646
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 *p_start = tok->start;
1648 *p_end = tok->cur;
1649 return STRING;
1650 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001651
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001652 /* Line continuation */
1653 if (c == '\\') {
1654 c = tok_nextc(tok);
1655 if (c != '\n') {
1656 tok->done = E_LINECONT;
1657 tok->cur = tok->inp;
1658 return ERRORTOKEN;
1659 }
1660 tok->cont_line = 1;
1661 goto again; /* Read next line */
1662 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001663
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 /* Check for two-character token */
1665 {
1666 int c2 = tok_nextc(tok);
1667 int token = PyToken_TwoChars(c, c2);
1668 if (token != OP) {
1669 int c3 = tok_nextc(tok);
1670 int token3 = PyToken_ThreeChars(c, c2, c3);
1671 if (token3 != OP) {
1672 token = token3;
1673 } else {
1674 tok_backup(tok, c3);
1675 }
1676 *p_start = tok->start;
1677 *p_end = tok->cur;
1678 return token;
1679 }
1680 tok_backup(tok, c2);
1681 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001682
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 /* Keep track of parentheses nesting level */
1684 switch (c) {
1685 case '(':
1686 case '[':
1687 case '{':
1688 tok->level++;
1689 break;
1690 case ')':
1691 case ']':
1692 case '}':
1693 tok->level--;
1694 break;
1695 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001696
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 /* Punctuation character */
1698 *p_start = tok->start;
1699 *p_end = tok->cur;
1700 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001701}
1702
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001703int
1704PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 int result = tok_get(tok, p_start, p_end);
1707 if (tok->decoding_erred) {
1708 result = ERRORTOKEN;
1709 tok->done = E_DECODE;
1710 }
1711 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001712}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001713
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001714/* Get the encoding of a Python file. Check for the coding cookie and check if
1715 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001716
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001717 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1718 encoding in the first or second line of the file (in which case the encoding
1719 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001720
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001721 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1722 by the caller. */
1723
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001724char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001725PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 struct tok_state *tok;
1728 FILE *fp;
1729 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001730
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001731 fd = dup(fd);
1732 if (fd < 0) {
1733 return NULL;
1734 }
1735 fp = fdopen(fd, "r");
1736 if (fp == NULL) {
1737 return NULL;
1738 }
1739 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1740 if (tok == NULL) {
1741 fclose(fp);
1742 return NULL;
1743 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001744#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001745 if (filename != NULL) {
1746 Py_INCREF(filename);
1747 tok->filename = filename;
1748 }
1749 else {
1750 tok->filename = PyUnicode_FromString("<string>");
1751 if (tok->filename == NULL) {
1752 fclose(fp);
1753 PyTokenizer_Free(tok);
1754 return encoding;
1755 }
1756 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001757#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 while (tok->lineno < 2 && tok->done == E_OK) {
1759 PyTokenizer_Get(tok, &p_start, &p_end);
1760 }
1761 fclose(fp);
1762 if (tok->encoding) {
1763 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1764 if (encoding)
1765 strcpy(encoding, tok->encoding);
1766 }
1767 PyTokenizer_Free(tok);
1768 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001769}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001770
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001771char *
1772PyTokenizer_FindEncoding(int fd)
1773{
1774 return PyTokenizer_FindEncodingFilename(fd, NULL);
1775}
1776
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout; for NAME,
   NUMBER, STRING and OP tokens also print the text between `start` and
   `end` in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif