blob: 97990e9451b34260c0f57b0824799d7272cb0b9b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  Every use of C is parenthesized so that compound
   argument expressions expand as intended.  C is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  Every use of C is parenthesized so
   that compound argument expressions expand as intended.  C is still
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.

   This table must match the #defines in token.h!  The number in front of
   each row is the array index of the first name on that row. */

const char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",
    /* 10 */ "RSQB", "COLON", "COMMA", "SEMI", "PLUS",
    /* 15 */ "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL", "LESSEQUAL",
    /* 30 */ "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT",
             "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL", "STAREQUAL",
             "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
             "LEFTSHIFTEQUAL",
    /* 45 */ "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL", "DOUBLESLASH",
             "DOUBLESLASHEQUAL", "AT",
    /* 50 */ "RARROW", "ELLIPSIS",
    /* 52 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000176static char *
177decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700179 return new_string(str, strlen(str), tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and replacing '_' with '-'.  Returns the string literal "utf-8" for
   "utf-8" or anything starting with "utf-8-"; returns the string literal
   "iso-8859-1" for "latin-1", "iso-8859-1", "iso-latin-1" or anything
   starting with one of those followed by '-'; otherwise returns S itself.
   The returned literals must not be modified or freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a high byte where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 if (tok->cont_line)
287 /* It's a continuation line, so it can't be a coding spec. */
288 return 1;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700289 if (!get_coding_spec(line, &cs, size, tok))
290 return 0;
291 if (!cs)
292 return 1;
293 tok->read_coding_spec = 1;
294 if (tok->encoding == NULL) {
295 assert(tok->decoding_state == STATE_RAW);
296 if (strcmp(cs, "utf-8") == 0) {
297 tok->encoding = cs;
298 } else {
299 r = set_readline(tok, cs);
300 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700302 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700304 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300305 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700306 "encoding problem: %s", cs);
307 PyMem_FREE(cs);
308 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000309 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700310 } else { /* then, compare cs with BOM */
311 r = (strcmp(tok->encoding, cs) == 0);
312 if (!r)
313 PyErr_Format(PyExc_SyntaxError,
314 "encoding problem: %s with BOM", cs);
315 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318}
319
320/* See whether the file starts with a BOM. If it does,
321 invoke the set_readline function with the new encoding.
322 Return 1 on success, 0 on failure. */
323
324static int
325check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 void unget_char(int, struct tok_state *),
327 int set_readline(struct tok_state *, const char *),
328 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000329{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 int ch1, ch2, ch3;
331 ch1 = get_char(tok);
332 tok->decoding_state = STATE_RAW;
333 if (ch1 == EOF) {
334 return 1;
335 } else if (ch1 == 0xEF) {
336 ch2 = get_char(tok);
337 if (ch2 != 0xBB) {
338 unget_char(ch2, tok);
339 unget_char(ch1, tok);
340 return 1;
341 }
342 ch3 = get_char(tok);
343 if (ch3 != 0xBF) {
344 unget_char(ch3, tok);
345 unget_char(ch2, tok);
346 unget_char(ch1, tok);
347 return 1;
348 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 /* Disable support for UTF-16 BOMs until a decision
351 is made whether this needs to be supported. */
352 } else if (ch1 == 0xFE) {
353 ch2 = get_char(tok);
354 if (ch2 != 0xFF) {
355 unget_char(ch2, tok);
356 unget_char(ch1, tok);
357 return 1;
358 }
359 if (!set_readline(tok, "utf-16-be"))
360 return 0;
361 tok->decoding_state = STATE_NORMAL;
362 } else if (ch1 == 0xFF) {
363 ch2 = get_char(tok);
364 if (ch2 != 0xFE) {
365 unget_char(ch2, tok);
366 unget_char(ch1, tok);
367 return 1;
368 }
369 if (!set_readline(tok, "utf-16-le"))
370 return 0;
371 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 } else {
374 unget_char(ch1, tok);
375 return 1;
376 }
377 if (tok->encoding != NULL)
378 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700379 tok->encoding = new_string("utf-8", 5, tok);
380 if (!tok->encoding)
381 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 /* No need to set_readline: input is already utf-8 */
383 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384}
385
386/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000389 On entry, tok->decoding_buffer will be one of:
390 1) NULL: need to call tok->decoding_readline to get a new line
391 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000393 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 (in the s buffer) to copy entire contents of the line read
395 by tok->decoding_readline. tok->decoding_buffer has the overflow.
396 In this case, fp_readl is called in a loop (with an expanded buffer)
397 until the buffer ends with a '\n' (or until the end of the file is
398 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000399*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400
401static char *
402fp_readl(char *s, int size, struct tok_state *tok)
403{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyObject* bufobj;
405 const char *buf;
406 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 /* Ask for one less byte so we can terminate it */
409 assert(size > 0);
410 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000412 if (tok->decoding_buffer) {
413 bufobj = tok->decoding_buffer;
414 Py_INCREF(bufobj);
415 }
416 else
417 {
418 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
419 if (bufobj == NULL)
420 goto error;
421 }
422 if (PyUnicode_CheckExact(bufobj))
423 {
424 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
425 if (buf == NULL) {
426 goto error;
427 }
428 }
429 else
430 {
431 buf = PyByteArray_AsString(bufobj);
432 if (buf == NULL) {
433 goto error;
434 }
435 buflen = PyByteArray_GET_SIZE(bufobj);
436 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438 Py_XDECREF(tok->decoding_buffer);
439 if (buflen > size) {
440 /* Too many chars, the rest goes into tok->decoding_buffer */
441 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
442 buflen-size);
443 if (tok->decoding_buffer == NULL)
444 goto error;
445 buflen = size;
446 }
447 else
448 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000449
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000450 memcpy(s, buf, buflen);
451 s[buflen] = '\0';
452 if (buflen == 0) /* EOF */
453 s = NULL;
454 Py_DECREF(bufobj);
455 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000456
457error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 Py_XDECREF(bufobj);
459 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460}
461
462/* Set the readline function for TOK to a StreamReader's
463 readline function. The StreamReader is named ENC.
464
465 This function is called from check_bom and check_coding_spec.
466
467 ENC is usually identical to the future value of tok->encoding,
468 except for the (currently unsupported) case of UTF-16.
469
470 Return 1 on success, 0 on failure. */
471
472static int
473fp_setreadl(struct tok_state *tok, const char* enc)
474{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200476 _Py_IDENTIFIER(open);
477 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000478 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 io = PyImport_ImportModuleNoBlock("io");
481 if (io == NULL)
482 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000483
Victor Stinner22a351a2010-10-14 12:04:34 +0000484 fd = fileno(tok->fp);
485 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
486 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
487 goto cleanup;
488 }
489
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200490 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000491 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 if (stream == NULL)
493 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200496 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 /* The file has been reopened; parsing will restart from
500 * the beginning of the file, we have to reset the line number.
501 * But this function has been called from inside tok_nextc() which
502 * will increment lineno before it returns. So we set it -1 so that
503 * the next call to tok_nextc() will start with tok->lineno == 0.
504 */
505 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000506
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000507 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 Py_XDECREF(stream);
509 Py_XDECREF(io);
510 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000511}
512
513/* Fetch the next byte from TOK. */
514
515static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000516 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517}
518
519/* Unfetch the last byte back into TOK. */
520
521static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523}
524
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   S must point into a NUL-terminated buffer.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.

   Rejected: a stray continuation byte, the overlong lead bytes C0 and C1,
   overlong three- and four-byte forms (E0 80..9F, F0 80..8F), encoded
   surrogates (ED A0..BF), anything above U+10FFFF (F4 90.. and lead bytes
   F5..FF), and truncated sequences. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC2)
        /* 80..BF: following byte; C0..C1: overlong form of 00..7F */
        return 0;
    if (*s < 0xE0) {
        /* C2 80 .. DF BF -- U+0080..U+07FF */
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* E0 A0 80 .. EF BF BF -- U+0800..U+FFFF.
           s[1] may be read: *s is not the terminator. */
        if (*s == 0xE0 && s[1] < 0xA0)
            /* overlong form of U+0000..U+07FF */
            return 0;
        if (*s == 0xED && s[1] >= 0xA0)
            /* would decode to a surrogate, U+D800..U+DFFF */
            return 0;
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* F0 90 80 80 .. F4 8F BF BF -- U+10000..U+10FFFF */
        if (*s == 0xF0 && s[1] < 0x90)
            /* overlong form of U+0000..U+FFFF */
            return 0;
        if (*s == 0xF4 && s[1] >= 0x90)
            /* beyond U+10FFFF */
            return 0;
        expected = 3;
    }
    else
        /* F5..FF never start a sequence */
        return 0;

    /* Check the following bytes front to back: the NUL terminator is
       < 0x80, so a truncated sequence is rejected before any byte past
       the terminator is read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
552
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553/* Read a line of input from TOK. Determine encoding
554 if necessary. */
555
556static char *
557decoding_fgets(char *s, int size, struct tok_state *tok)
558{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 char *line = NULL;
560 int badchar = 0;
561 for (;;) {
562 if (tok->decoding_state == STATE_NORMAL) {
563 /* We already have a codec associated with
564 this input. */
565 line = fp_readl(s, size, tok);
566 break;
567 } else if (tok->decoding_state == STATE_RAW) {
568 /* We want a 'raw' read. */
569 line = Py_UniversalNewlineFgets(s, size,
570 tok->fp, NULL);
571 break;
572 } else {
573 /* We have not yet determined the encoding.
574 If an encoding is found, use the file-pointer
575 reader functions from now on. */
576 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
577 return error_ret(tok);
578 assert(tok->decoding_state != STATE_INIT);
579 }
580 }
581 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
582 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
583 return error_ret(tok);
584 }
585 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 /* The default encoding is UTF-8, so make sure we don't have any
588 non-UTF-8 sequences in it. */
589 if (line && !tok->encoding) {
590 unsigned char *c;
591 int length;
592 for (c = (unsigned char *)line; *c; c += length)
593 if (!(length = valid_utf8(c))) {
594 badchar = *c;
595 break;
596 }
597 }
598 if (badchar) {
599 /* Need to add 1 to the line number, since this line
600 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200601 PyErr_Format(PyExc_SyntaxError,
602 "Non-UTF-8 code starting with '\\x%.2x' "
603 "in file %U on line %i, "
604 "but no encoding declared; "
605 "see http://python.org/dev/peps/pep-0263/ for details",
606 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 return error_ret(tok);
608 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611}
612
613static int
614decoding_feof(struct tok_state *tok)
615{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 if (tok->decoding_state != STATE_NORMAL) {
617 return feof(tok->fp);
618 } else {
619 PyObject* buf = tok->decoding_buffer;
620 if (buf == NULL) {
621 buf = PyObject_CallObject(tok->decoding_readline, NULL);
622 if (buf == NULL) {
623 error_ret(tok);
624 return 1;
625 } else {
626 tok->decoding_buffer = buf;
627 }
628 }
629 return PyObject_Length(buf) == 0;
630 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633/* Fetch a byte from TOK, using the string buffer. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638}
639
640/* Unfetch a byte from TOK, using the string buffer. */
641
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000642static void
643buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 tok->str--;
645 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646}
647
648/* Set the readline function for TOK to ENC. For the string-based
649 tokenizer, this means to just record the encoding. */
650
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000651static int
652buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 tok->enc = enc;
654 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655}
656
657/* Return a UTF-8 encoding Python string object from the
658 C byte string STR, which is encoded with ENC. */
659
660static PyObject *
661translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662 PyObject *utf8;
663 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
664 if (buf == NULL)
665 return NULL;
666 utf8 = PyUnicode_AsUTF8String(buf);
667 Py_DECREF(buf);
668 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669}
670
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000671
672static char *
673translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200674 int skip_next_lf = 0;
675 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 char *buf, *current;
677 char c = '\0';
678 buf = PyMem_MALLOC(needed_length);
679 if (buf == NULL) {
680 tok->done = E_NOMEM;
681 return NULL;
682 }
683 for (current = buf; *s; s++, current++) {
684 c = *s;
685 if (skip_next_lf) {
686 skip_next_lf = 0;
687 if (c == '\n') {
688 c = *++s;
689 if (!c)
690 break;
691 }
692 }
693 if (c == '\r') {
694 skip_next_lf = 1;
695 c = '\n';
696 }
697 *current = c;
698 }
699 /* If this is exec input, add a newline to the end of the string if
700 there isn't one already. */
701 if (exec_input && c != '\n') {
702 *current = '\n';
703 current++;
704 }
705 *current = '\0';
706 final_length = current - buf + 1;
707 if (final_length < needed_length && final_length)
708 /* should never fail */
709 buf = PyMem_REALLOC(buf, final_length);
710 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000711}
712
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713/* Decode a byte string STR for use as the buffer of TOK.
714 Look for encoding declarations inside STR, and record them
715 inside TOK. */
716
717static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000718decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 PyObject* utf8 = NULL;
721 const char *str;
722 const char *s;
723 const char *newl[2] = {NULL, NULL};
724 int lineno = 0;
725 tok->input = str = translate_newlines(input, single, tok);
726 if (str == NULL)
727 return NULL;
728 tok->enc = NULL;
729 tok->str = str;
730 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
731 return error_ret(tok);
732 str = tok->str; /* string after BOM if any */
733 assert(str);
734 if (tok->enc != NULL) {
735 utf8 = translate_into_utf8(str, tok->enc);
736 if (utf8 == NULL)
737 return error_ret(tok);
738 str = PyBytes_AsString(utf8);
739 }
740 for (s = str;; s++) {
741 if (*s == '\0') break;
742 else if (*s == '\n') {
743 assert(lineno < 2);
744 newl[lineno] = s;
745 lineno++;
746 if (lineno == 2) break;
747 }
748 }
749 tok->enc = NULL;
750 /* need to check line 1 and 2 separately since check_coding_spec
751 assumes a single line as input */
752 if (newl[0]) {
753 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
754 return error_ret(tok);
755 if (tok->enc == NULL && newl[1]) {
756 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
757 tok, buf_setreadl))
758 return error_ret(tok);
759 }
760 }
761 if (tok->enc != NULL) {
762 assert(utf8 == NULL);
763 utf8 = translate_into_utf8(str, tok->enc);
764 if (utf8 == NULL)
765 return error_ret(tok);
766 str = PyBytes_AS_STRING(utf8);
767 }
768 assert(tok->decoding_buffer == NULL);
769 tok->decoding_buffer = utf8; /* CAUTION */
770 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000771}
772
773#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774
775/* Set up tokenizer for string */
776
777struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000778PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 struct tok_state *tok = tok_new();
781 if (tok == NULL)
782 return NULL;
783 str = (char *)decode_str(str, exec_input, tok);
784 if (str == NULL) {
785 PyTokenizer_Free(tok);
786 return NULL;
787 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000788
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 /* XXX: constify members. */
790 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
791 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000792}
793
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000794struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000795PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000796{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 struct tok_state *tok = tok_new();
798 if (tok == NULL)
799 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000800#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000802#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 if (str == NULL) {
804 PyTokenizer_Free(tok);
805 return NULL;
806 }
807 tok->decoding_state = STATE_RAW;
808 tok->read_coding_spec = 1;
809 tok->enc = NULL;
810 tok->str = str;
811 tok->encoding = (char *)PyMem_MALLOC(6);
812 if (!tok->encoding) {
813 PyTokenizer_Free(tok);
814 return NULL;
815 }
816 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000817
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 /* XXX: constify members. */
819 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
820 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000821}
822
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000823/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824
825struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000826PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000827{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 struct tok_state *tok = tok_new();
829 if (tok == NULL)
830 return NULL;
831 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
832 PyTokenizer_Free(tok);
833 return NULL;
834 }
835 tok->cur = tok->inp = tok->buf;
836 tok->end = tok->buf + BUFSIZ;
837 tok->fp = fp;
838 tok->prompt = ps1;
839 tok->nextprompt = ps2;
840 if (enc != NULL) {
841 /* Must copy encoding declaration since it
842 gets copied into the parse tree. */
843 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
844 if (!tok->encoding) {
845 PyTokenizer_Free(tok);
846 return NULL;
847 }
848 strcpy(tok->encoding, enc);
849 tok->decoding_state = STATE_NORMAL;
850 }
851 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852}
853
854
855/* Free a tok_state structure */
856
857void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000858PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 if (tok->encoding != NULL)
861 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000862#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000863 Py_XDECREF(tok->decoding_readline);
864 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200865 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000866#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 if (tok->fp != NULL && tok->buf != NULL)
868 PyMem_FREE(tok->buf);
869 if (tok->input)
870 PyMem_FREE((char *)tok->input);
871 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000872}
873
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000874/* Get next char, updating state; error code goes into tok->done */
875
876static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200877tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000878{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 for (;;) {
880 if (tok->cur != tok->inp) {
881 return Py_CHARMASK(*tok->cur++); /* Fast path */
882 }
883 if (tok->done != E_OK)
884 return EOF;
885 if (tok->fp == NULL) {
886 char *end = strchr(tok->inp, '\n');
887 if (end != NULL)
888 end++;
889 else {
890 end = strchr(tok->inp, '\0');
891 if (end == tok->inp) {
892 tok->done = E_EOF;
893 return EOF;
894 }
895 }
896 if (tok->start == NULL)
897 tok->buf = tok->cur;
898 tok->line_start = tok->cur;
899 tok->lineno++;
900 tok->inp = end;
901 return Py_CHARMASK(*tok->cur++);
902 }
903 if (tok->prompt != NULL) {
904 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000905#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000906 if (newtok != NULL) {
907 char *translated = translate_newlines(newtok, 0, tok);
908 PyMem_FREE(newtok);
909 if (translated == NULL)
910 return EOF;
911 newtok = translated;
912 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000913 if (tok->encoding && newtok && *newtok) {
914 /* Recode to UTF-8 */
915 Py_ssize_t buflen;
916 const char* buf;
917 PyObject *u = translate_into_utf8(newtok, tok->encoding);
918 PyMem_FREE(newtok);
919 if (!u) {
920 tok->done = E_DECODE;
921 return EOF;
922 }
923 buflen = PyBytes_GET_SIZE(u);
924 buf = PyBytes_AS_STRING(u);
925 if (!buf) {
926 Py_DECREF(u);
927 tok->done = E_DECODE;
928 return EOF;
929 }
930 newtok = PyMem_MALLOC(buflen+1);
931 strcpy(newtok, buf);
932 Py_DECREF(u);
933 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000934#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 if (tok->nextprompt != NULL)
936 tok->prompt = tok->nextprompt;
937 if (newtok == NULL)
938 tok->done = E_INTR;
939 else if (*newtok == '\0') {
940 PyMem_FREE(newtok);
941 tok->done = E_EOF;
942 }
943 else if (tok->start != NULL) {
944 size_t start = tok->start - tok->buf;
945 size_t oldlen = tok->cur - tok->buf;
946 size_t newlen = oldlen + strlen(newtok);
947 char *buf = tok->buf;
948 buf = (char *)PyMem_REALLOC(buf, newlen+1);
949 tok->lineno++;
950 if (buf == NULL) {
951 PyMem_FREE(tok->buf);
952 tok->buf = NULL;
953 PyMem_FREE(newtok);
954 tok->done = E_NOMEM;
955 return EOF;
956 }
957 tok->buf = buf;
958 tok->cur = tok->buf + oldlen;
959 tok->line_start = tok->cur;
960 strcpy(tok->buf + oldlen, newtok);
961 PyMem_FREE(newtok);
962 tok->inp = tok->buf + newlen;
963 tok->end = tok->inp + 1;
964 tok->start = tok->buf + start;
965 }
966 else {
967 tok->lineno++;
968 if (tok->buf != NULL)
969 PyMem_FREE(tok->buf);
970 tok->buf = newtok;
971 tok->line_start = tok->buf;
972 tok->cur = tok->buf;
973 tok->line_start = tok->buf;
974 tok->inp = strchr(tok->buf, '\0');
975 tok->end = tok->inp + 1;
976 }
977 }
978 else {
979 int done = 0;
980 Py_ssize_t cur = 0;
981 char *pt;
982 if (tok->start == NULL) {
983 if (tok->buf == NULL) {
984 tok->buf = (char *)
985 PyMem_MALLOC(BUFSIZ);
986 if (tok->buf == NULL) {
987 tok->done = E_NOMEM;
988 return EOF;
989 }
990 tok->end = tok->buf + BUFSIZ;
991 }
992 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
993 tok) == NULL) {
994 tok->done = E_EOF;
995 done = 1;
996 }
997 else {
998 tok->done = E_OK;
999 tok->inp = strchr(tok->buf, '\0');
1000 done = tok->inp[-1] == '\n';
1001 }
1002 }
1003 else {
1004 cur = tok->cur - tok->buf;
1005 if (decoding_feof(tok)) {
1006 tok->done = E_EOF;
1007 done = 1;
1008 }
1009 else
1010 tok->done = E_OK;
1011 }
1012 tok->lineno++;
1013 /* Read until '\n' or EOF */
1014 while (!done) {
1015 Py_ssize_t curstart = tok->start == NULL ? -1 :
1016 tok->start - tok->buf;
1017 Py_ssize_t curvalid = tok->inp - tok->buf;
1018 Py_ssize_t newsize = curvalid + BUFSIZ;
1019 char *newbuf = tok->buf;
1020 newbuf = (char *)PyMem_REALLOC(newbuf,
1021 newsize);
1022 if (newbuf == NULL) {
1023 tok->done = E_NOMEM;
1024 tok->cur = tok->inp;
1025 return EOF;
1026 }
1027 tok->buf = newbuf;
1028 tok->inp = tok->buf + curvalid;
1029 tok->end = tok->buf + newsize;
1030 tok->start = curstart < 0 ? NULL :
1031 tok->buf + curstart;
1032 if (decoding_fgets(tok->inp,
1033 (int)(tok->end - tok->inp),
1034 tok) == NULL) {
1035 /* Break out early on decoding
1036 errors, as tok->buf will be NULL
1037 */
1038 if (tok->decoding_erred)
1039 return EOF;
1040 /* Last line does not end in \n,
1041 fake one */
1042 strcpy(tok->inp, "\n");
1043 }
1044 tok->inp = strchr(tok->inp, '\0');
1045 done = tok->inp[-1] == '\n';
1046 }
1047 if (tok->buf != NULL) {
1048 tok->cur = tok->buf + cur;
1049 tok->line_start = tok->cur;
1050 /* replace "\r\n" with "\n" */
1051 /* For Mac leave the \r, giving a syntax error */
1052 pt = tok->inp - 2;
1053 if (pt >= tok->buf && *pt == '\r') {
1054 *pt++ = '\n';
1055 *pt = '\0';
1056 tok->inp = pt;
1057 }
1058 }
1059 }
1060 if (tok->done != E_OK) {
1061 if (tok->prompt != NULL)
1062 PySys_WriteStderr("\n");
1063 tok->cur = tok->inp;
1064 return EOF;
1065 }
1066 }
1067 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001068}
1069
1070
1071/* Back-up one character */
1072
1073static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001074tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001075{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001076 if (c != EOF) {
1077 if (--tok->cur < tok->buf)
1078 Py_FatalError("tok_backup: beginning of buffer");
1079 if (*tok->cur != c)
1080 *tok->cur = c;
1081 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001082}
1083
1084
1085/* Return the token corresponding to a single character */
1086
1087int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001088PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 switch (c) {
1091 case '(': return LPAR;
1092 case ')': return RPAR;
1093 case '[': return LSQB;
1094 case ']': return RSQB;
1095 case ':': return COLON;
1096 case ',': return COMMA;
1097 case ';': return SEMI;
1098 case '+': return PLUS;
1099 case '-': return MINUS;
1100 case '*': return STAR;
1101 case '/': return SLASH;
1102 case '|': return VBAR;
1103 case '&': return AMPER;
1104 case '<': return LESS;
1105 case '>': return GREATER;
1106 case '=': return EQUAL;
1107 case '.': return DOT;
1108 case '%': return PERCENT;
1109 case '{': return LBRACE;
1110 case '}': return RBRACE;
1111 case '^': return CIRCUMFLEX;
1112 case '~': return TILDE;
1113 case '@': return AT;
1114 default: return OP;
1115 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116}
1117
1118
Guido van Rossumfbab9051991-10-20 20:25:03 +00001119int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001120PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001121{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 switch (c1) {
1123 case '=':
1124 switch (c2) {
1125 case '=': return EQEQUAL;
1126 }
1127 break;
1128 case '!':
1129 switch (c2) {
1130 case '=': return NOTEQUAL;
1131 }
1132 break;
1133 case '<':
1134 switch (c2) {
1135 case '>': return NOTEQUAL;
1136 case '=': return LESSEQUAL;
1137 case '<': return LEFTSHIFT;
1138 }
1139 break;
1140 case '>':
1141 switch (c2) {
1142 case '=': return GREATEREQUAL;
1143 case '>': return RIGHTSHIFT;
1144 }
1145 break;
1146 case '+':
1147 switch (c2) {
1148 case '=': return PLUSEQUAL;
1149 }
1150 break;
1151 case '-':
1152 switch (c2) {
1153 case '=': return MINEQUAL;
1154 case '>': return RARROW;
1155 }
1156 break;
1157 case '*':
1158 switch (c2) {
1159 case '*': return DOUBLESTAR;
1160 case '=': return STAREQUAL;
1161 }
1162 break;
1163 case '/':
1164 switch (c2) {
1165 case '/': return DOUBLESLASH;
1166 case '=': return SLASHEQUAL;
1167 }
1168 break;
1169 case '|':
1170 switch (c2) {
1171 case '=': return VBAREQUAL;
1172 }
1173 break;
1174 case '%':
1175 switch (c2) {
1176 case '=': return PERCENTEQUAL;
1177 }
1178 break;
1179 case '&':
1180 switch (c2) {
1181 case '=': return AMPEREQUAL;
1182 }
1183 break;
1184 case '^':
1185 switch (c2) {
1186 case '=': return CIRCUMFLEXEQUAL;
1187 }
1188 break;
1189 }
1190 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001191}
1192
Thomas Wouters434d0822000-08-24 20:11:32 +00001193int
1194PyToken_ThreeChars(int c1, int c2, int c3)
1195{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196 switch (c1) {
1197 case '<':
1198 switch (c2) {
1199 case '<':
1200 switch (c3) {
1201 case '=':
1202 return LEFTSHIFTEQUAL;
1203 }
1204 break;
1205 }
1206 break;
1207 case '>':
1208 switch (c2) {
1209 case '>':
1210 switch (c3) {
1211 case '=':
1212 return RIGHTSHIFTEQUAL;
1213 }
1214 break;
1215 }
1216 break;
1217 case '*':
1218 switch (c2) {
1219 case '*':
1220 switch (c3) {
1221 case '=':
1222 return DOUBLESTAREQUAL;
1223 }
1224 break;
1225 }
1226 break;
1227 case '/':
1228 switch (c2) {
1229 case '/':
1230 switch (c3) {
1231 case '=':
1232 return DOUBLESLASHEQUAL;
1233 }
1234 break;
1235 }
1236 break;
1237 case '.':
1238 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001239 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 switch (c3) {
1241 case '.':
1242 return ELLIPSIS;
1243 }
1244 break;
1245 }
1246 break;
1247 }
1248 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001249}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001250
Guido van Rossum926f13a1998-04-09 21:38:06 +00001251static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001252indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001253{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 if (tok->alterror) {
1255 tok->done = E_TABSPACE;
1256 tok->cur = tok->inp;
1257 return 1;
1258 }
1259 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001260#ifdef PGEN
1261 PySys_WriteStderr("inconsistent use of tabs and spaces "
1262 "in indentation\n");
1263#else
1264 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001266#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 tok->altwarning = 0;
1268 }
1269 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270}
1271
Martin v. Löwis47383402007-08-15 07:32:56 +00001272#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001273#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001274#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275/* Verify that the identifier follows PEP 3131.
1276 All identifier strings are guaranteed to be "ready" unicode objects.
1277 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001278static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001279verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001280{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001281 PyObject *s;
1282 int result;
1283 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1286 PyErr_Clear();
1287 tok->done = E_IDENTIFIER;
1288 } else {
1289 tok->done = E_ERROR;
1290 }
1291 return 0;
1292 }
1293 result = PyUnicode_IsIdentifier(s);
1294 Py_DECREF(s);
1295 if (result == 0)
1296 tok->done = E_IDENTIFIER;
1297 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001298}
1299#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001300
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301/* Get next token, after space stripping etc. */
1302
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001303static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001304tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001305{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001306 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001308
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001310 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 tok->start = NULL;
1312 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001313
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 /* Get indentation level */
1315 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001316 int col = 0;
1317 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 tok->atbol = 0;
1319 for (;;) {
1320 c = tok_nextc(tok);
1321 if (c == ' ')
1322 col++, altcol++;
1323 else if (c == '\t') {
1324 col = (col/tok->tabsize + 1) * tok->tabsize;
1325 altcol = (altcol/tok->alttabsize + 1)
1326 * tok->alttabsize;
1327 }
1328 else if (c == '\014') /* Control-L (formfeed) */
1329 col = altcol = 0; /* For Emacs users */
1330 else
1331 break;
1332 }
1333 tok_backup(tok, c);
1334 if (c == '#' || c == '\n') {
1335 /* Lines with only whitespace and/or comments
1336 shouldn't affect the indentation and are
1337 not passed to the parser as NEWLINE tokens,
1338 except *totally* empty lines in interactive
1339 mode, which signal the end of a command group. */
1340 if (col == 0 && c == '\n' && tok->prompt != NULL)
1341 blankline = 0; /* Let it through */
1342 else
1343 blankline = 1; /* Ignore completely */
1344 /* We can't jump back right here since we still
1345 may need to skip to the end of a comment */
1346 }
1347 if (!blankline && tok->level == 0) {
1348 if (col == tok->indstack[tok->indent]) {
1349 /* No change */
1350 if (altcol != tok->altindstack[tok->indent]) {
1351 if (indenterror(tok))
1352 return ERRORTOKEN;
1353 }
1354 }
1355 else if (col > tok->indstack[tok->indent]) {
1356 /* Indent -- always one */
1357 if (tok->indent+1 >= MAXINDENT) {
1358 tok->done = E_TOODEEP;
1359 tok->cur = tok->inp;
1360 return ERRORTOKEN;
1361 }
1362 if (altcol <= tok->altindstack[tok->indent]) {
1363 if (indenterror(tok))
1364 return ERRORTOKEN;
1365 }
1366 tok->pendin++;
1367 tok->indstack[++tok->indent] = col;
1368 tok->altindstack[tok->indent] = altcol;
1369 }
1370 else /* col < tok->indstack[tok->indent] */ {
1371 /* Dedent -- any number, must be consistent */
1372 while (tok->indent > 0 &&
1373 col < tok->indstack[tok->indent]) {
1374 tok->pendin--;
1375 tok->indent--;
1376 }
1377 if (col != tok->indstack[tok->indent]) {
1378 tok->done = E_DEDENT;
1379 tok->cur = tok->inp;
1380 return ERRORTOKEN;
1381 }
1382 if (altcol != tok->altindstack[tok->indent]) {
1383 if (indenterror(tok))
1384 return ERRORTOKEN;
1385 }
1386 }
1387 }
1388 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001391
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 /* Return pending indents/dedents */
1393 if (tok->pendin != 0) {
1394 if (tok->pendin < 0) {
1395 tok->pendin++;
1396 return DEDENT;
1397 }
1398 else {
1399 tok->pendin--;
1400 return INDENT;
1401 }
1402 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001403
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001405 tok->start = NULL;
1406 /* Skip spaces */
1407 do {
1408 c = tok_nextc(tok);
1409 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 /* Set start of current token */
1412 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001413
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 /* Skip comment */
1415 if (c == '#')
1416 while (c != EOF && c != '\n')
1417 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001418
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419 /* Check for EOF and errors now */
1420 if (c == EOF) {
1421 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1422 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 /* Identifier (most frequent token!) */
1425 nonascii = 0;
1426 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001427 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001428 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001429 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001430 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001431 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001432 /* Since this is a backwards compatibility support literal we don't
1433 want to support it in arbitrary order like byte literals. */
1434 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1435 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001436 /* ur"" and ru"" are not supported */
1437 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001438 saw_r = 1;
1439 else
1440 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 c = tok_nextc(tok);
1442 if (c == '"' || c == '\'')
1443 goto letter_quote;
1444 }
1445 while (is_potential_identifier_char(c)) {
1446 if (c >= 128)
1447 nonascii = 1;
1448 c = tok_nextc(tok);
1449 }
1450 tok_backup(tok, c);
1451 if (nonascii &&
1452 !verify_identifier(tok)) {
1453 tok->done = E_IDENTIFIER;
1454 return ERRORTOKEN;
1455 }
1456 *p_start = tok->start;
1457 *p_end = tok->cur;
1458 return NAME;
1459 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 /* Newline */
1462 if (c == '\n') {
1463 tok->atbol = 1;
1464 if (blankline || tok->level > 0)
1465 goto nextline;
1466 *p_start = tok->start;
1467 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1468 tok->cont_line = 0;
1469 return NEWLINE;
1470 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001471
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 /* Period or number starting with period? */
1473 if (c == '.') {
1474 c = tok_nextc(tok);
1475 if (isdigit(c)) {
1476 goto fraction;
1477 } else if (c == '.') {
1478 c = tok_nextc(tok);
1479 if (c == '.') {
1480 *p_start = tok->start;
1481 *p_end = tok->cur;
1482 return ELLIPSIS;
1483 } else {
1484 tok_backup(tok, c);
1485 }
1486 tok_backup(tok, '.');
1487 } else {
1488 tok_backup(tok, c);
1489 }
1490 *p_start = tok->start;
1491 *p_end = tok->cur;
1492 return DOT;
1493 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 /* Number */
1496 if (isdigit(c)) {
1497 if (c == '0') {
1498 /* Hex, octal or binary -- maybe. */
1499 c = tok_nextc(tok);
1500 if (c == '.')
1501 goto fraction;
1502 if (c == 'j' || c == 'J')
1503 goto imaginary;
1504 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001505
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 /* Hex */
1507 c = tok_nextc(tok);
1508 if (!isxdigit(c)) {
1509 tok->done = E_TOKEN;
1510 tok_backup(tok, c);
1511 return ERRORTOKEN;
1512 }
1513 do {
1514 c = tok_nextc(tok);
1515 } while (isxdigit(c));
1516 }
1517 else if (c == 'o' || c == 'O') {
1518 /* Octal */
1519 c = tok_nextc(tok);
1520 if (c < '0' || c >= '8') {
1521 tok->done = E_TOKEN;
1522 tok_backup(tok, c);
1523 return ERRORTOKEN;
1524 }
1525 do {
1526 c = tok_nextc(tok);
1527 } while ('0' <= c && c < '8');
1528 }
1529 else if (c == 'b' || c == 'B') {
1530 /* Binary */
1531 c = tok_nextc(tok);
1532 if (c != '0' && c != '1') {
1533 tok->done = E_TOKEN;
1534 tok_backup(tok, c);
1535 return ERRORTOKEN;
1536 }
1537 do {
1538 c = tok_nextc(tok);
1539 } while (c == '0' || c == '1');
1540 }
1541 else {
1542 int nonzero = 0;
1543 /* maybe old-style octal; c is first char of it */
1544 /* in any case, allow '0' as a literal */
1545 while (c == '0')
1546 c = tok_nextc(tok);
1547 while (isdigit(c)) {
1548 nonzero = 1;
1549 c = tok_nextc(tok);
1550 }
1551 if (c == '.')
1552 goto fraction;
1553 else if (c == 'e' || c == 'E')
1554 goto exponent;
1555 else if (c == 'j' || c == 'J')
1556 goto imaginary;
1557 else if (nonzero) {
1558 tok->done = E_TOKEN;
1559 tok_backup(tok, c);
1560 return ERRORTOKEN;
1561 }
1562 }
1563 }
1564 else {
1565 /* Decimal */
1566 do {
1567 c = tok_nextc(tok);
1568 } while (isdigit(c));
1569 {
1570 /* Accept floating point numbers. */
1571 if (c == '.') {
1572 fraction:
1573 /* Fraction */
1574 do {
1575 c = tok_nextc(tok);
1576 } while (isdigit(c));
1577 }
1578 if (c == 'e' || c == 'E') {
1579 exponent:
1580 /* Exponent part */
1581 c = tok_nextc(tok);
1582 if (c == '+' || c == '-')
1583 c = tok_nextc(tok);
1584 if (!isdigit(c)) {
1585 tok->done = E_TOKEN;
1586 tok_backup(tok, c);
1587 return ERRORTOKEN;
1588 }
1589 do {
1590 c = tok_nextc(tok);
1591 } while (isdigit(c));
1592 }
1593 if (c == 'j' || c == 'J')
1594 /* Imaginary part */
1595 imaginary:
1596 c = tok_nextc(tok);
1597 }
1598 }
1599 tok_backup(tok, c);
1600 *p_start = tok->start;
1601 *p_end = tok->cur;
1602 return NUMBER;
1603 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001604
1605 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606 /* String */
1607 if (c == '\'' || c == '"') {
1608 int quote = c;
1609 int quote_size = 1; /* 1 or 3 */
1610 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001611
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 /* Find the quote size and start of string */
1613 c = tok_nextc(tok);
1614 if (c == quote) {
1615 c = tok_nextc(tok);
1616 if (c == quote)
1617 quote_size = 3;
1618 else
1619 end_quote_size = 1; /* empty string found */
1620 }
1621 if (c != quote)
1622 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001623
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 /* Get rest of string */
1625 while (end_quote_size != quote_size) {
1626 c = tok_nextc(tok);
1627 if (c == EOF) {
1628 if (quote_size == 3)
1629 tok->done = E_EOFS;
1630 else
1631 tok->done = E_EOLS;
1632 tok->cur = tok->inp;
1633 return ERRORTOKEN;
1634 }
1635 if (quote_size == 1 && c == '\n') {
1636 tok->done = E_EOLS;
1637 tok->cur = tok->inp;
1638 return ERRORTOKEN;
1639 }
1640 if (c == quote)
1641 end_quote_size += 1;
1642 else {
1643 end_quote_size = 0;
1644 if (c == '\\')
1645 c = tok_nextc(tok); /* skip escaped char */
1646 }
1647 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001648
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001649 *p_start = tok->start;
1650 *p_end = tok->cur;
1651 return STRING;
1652 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001653
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Line continuation */
1655 if (c == '\\') {
1656 c = tok_nextc(tok);
1657 if (c != '\n') {
1658 tok->done = E_LINECONT;
1659 tok->cur = tok->inp;
1660 return ERRORTOKEN;
1661 }
1662 tok->cont_line = 1;
1663 goto again; /* Read next line */
1664 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001665
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001666 /* Check for two-character token */
1667 {
1668 int c2 = tok_nextc(tok);
1669 int token = PyToken_TwoChars(c, c2);
1670 if (token != OP) {
1671 int c3 = tok_nextc(tok);
1672 int token3 = PyToken_ThreeChars(c, c2, c3);
1673 if (token3 != OP) {
1674 token = token3;
1675 } else {
1676 tok_backup(tok, c3);
1677 }
1678 *p_start = tok->start;
1679 *p_end = tok->cur;
1680 return token;
1681 }
1682 tok_backup(tok, c2);
1683 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001684
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 /* Keep track of parentheses nesting level */
1686 switch (c) {
1687 case '(':
1688 case '[':
1689 case '{':
1690 tok->level++;
1691 break;
1692 case ')':
1693 case ']':
1694 case '}':
1695 tok->level--;
1696 break;
1697 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001698
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 /* Punctuation character */
1700 *p_start = tok->start;
1701 *p_end = tok->cur;
1702 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001703}
1704
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001705int
1706PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1707{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 int result = tok_get(tok, p_start, p_end);
1709 if (tok->decoding_erred) {
1710 result = ERRORTOKEN;
1711 tok->done = E_DECODE;
1712 }
1713 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001714}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001715
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001716/* Get the encoding of a Python file. Check for the coding cookie and check if
1717 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001718
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001719 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1720 encoding in the first or second line of the file (in which case the encoding
1721 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001722
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001723 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1724 by the caller. */
1725
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001726char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001727PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001728{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 struct tok_state *tok;
1730 FILE *fp;
1731 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001732
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 fd = dup(fd);
1734 if (fd < 0) {
1735 return NULL;
1736 }
1737 fp = fdopen(fd, "r");
1738 if (fp == NULL) {
1739 return NULL;
1740 }
1741 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1742 if (tok == NULL) {
1743 fclose(fp);
1744 return NULL;
1745 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001746#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001747 if (filename != NULL) {
1748 Py_INCREF(filename);
1749 tok->filename = filename;
1750 }
1751 else {
1752 tok->filename = PyUnicode_FromString("<string>");
1753 if (tok->filename == NULL) {
1754 fclose(fp);
1755 PyTokenizer_Free(tok);
1756 return encoding;
1757 }
1758 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001759#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001760 while (tok->lineno < 2 && tok->done == E_OK) {
1761 PyTokenizer_Get(tok, &p_start, &p_end);
1762 }
1763 fclose(fp);
1764 if (tok->encoding) {
1765 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1766 if (encoding)
1767 strcpy(encoding, tok->encoding);
1768 }
1769 PyTokenizer_Free(tok);
1770 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001771}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001772
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001773char *
1774PyTokenizer_FindEncoding(int fd)
1775{
1776 return PyTokenizer_FindEncodingFilename(fd, NULL);
1777}
1778
Guido van Rossum408027e1996-12-30 16:17:54 +00001779#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001780
1781void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001782tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001783{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001784 printf("%s", _PyParser_TokenNames[type]);
1785 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1786 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001787}
1788
1789#endif