blob: 5c0bd6eb8a12a2aec78434db38d7a3d655984622 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero when C may begin an identifier: an ASCII letter, an
   underscore, or any value of 128 and above.  The argument is
   parenthesized at every use so that an expression argument keeps its
   own precedence; it is still evaluated several times, so it must not
   have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero when C may appear inside an identifier: an ASCII letter or
   digit, an underscore, or any value of 128 and above.  The argument is
   parenthesized at every use so that an expression argument keeps its
   own precedence; it is still evaluated several times, so it must not
   have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Printable token names, one per array slot.  The leading comment on
   each row gives the array index of the row's first entry.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: no decoding is performed; hand back an allocated copy of
   STR (NULL if new_string fails).  EXEC_INPUT is not used. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the common spellings of UTF-8 and Latin-1 encoding names to one
   canonical name.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when that prefix is a recognised spelling (optionally
   followed by "-<suffix>"); otherwise returns S itself, so callers can
   detect "no change" by comparing the result with S.  The returned
   literals must not be freed or modified. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 if (tok->cont_line)
287 /* It's a continuation line, so it can't be a coding spec. */
288 return 1;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700289 if (!get_coding_spec(line, &cs, size, tok))
290 return 0;
291 if (!cs)
292 return 1;
293 tok->read_coding_spec = 1;
294 if (tok->encoding == NULL) {
295 assert(tok->decoding_state == STATE_RAW);
296 if (strcmp(cs, "utf-8") == 0) {
297 tok->encoding = cs;
298 } else {
299 r = set_readline(tok, cs);
300 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700302 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700304 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300305 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700306 "encoding problem: %s", cs);
307 PyMem_FREE(cs);
308 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000309 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700310 } else { /* then, compare cs with BOM */
311 r = (strcmp(tok->encoding, cs) == 0);
312 if (!r)
313 PyErr_Format(PyExc_SyntaxError,
314 "encoding problem: %s with BOM", cs);
315 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318}
319
320/* See whether the file starts with a BOM. If it does,
321 invoke the set_readline function with the new encoding.
322 Return 1 on success, 0 on failure. */
323
324static int
325check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 void unget_char(int, struct tok_state *),
327 int set_readline(struct tok_state *, const char *),
328 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000329{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 int ch1, ch2, ch3;
331 ch1 = get_char(tok);
332 tok->decoding_state = STATE_RAW;
333 if (ch1 == EOF) {
334 return 1;
335 } else if (ch1 == 0xEF) {
336 ch2 = get_char(tok);
337 if (ch2 != 0xBB) {
338 unget_char(ch2, tok);
339 unget_char(ch1, tok);
340 return 1;
341 }
342 ch3 = get_char(tok);
343 if (ch3 != 0xBF) {
344 unget_char(ch3, tok);
345 unget_char(ch2, tok);
346 unget_char(ch1, tok);
347 return 1;
348 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 /* Disable support for UTF-16 BOMs until a decision
351 is made whether this needs to be supported. */
352 } else if (ch1 == 0xFE) {
353 ch2 = get_char(tok);
354 if (ch2 != 0xFF) {
355 unget_char(ch2, tok);
356 unget_char(ch1, tok);
357 return 1;
358 }
359 if (!set_readline(tok, "utf-16-be"))
360 return 0;
361 tok->decoding_state = STATE_NORMAL;
362 } else if (ch1 == 0xFF) {
363 ch2 = get_char(tok);
364 if (ch2 != 0xFE) {
365 unget_char(ch2, tok);
366 unget_char(ch1, tok);
367 return 1;
368 }
369 if (!set_readline(tok, "utf-16-le"))
370 return 0;
371 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 } else {
374 unget_char(ch1, tok);
375 return 1;
376 }
377 if (tok->encoding != NULL)
378 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700379 tok->encoding = new_string("utf-8", 5, tok);
380 if (!tok->encoding)
381 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 /* No need to set_readline: input is already utf-8 */
383 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384}
385
386/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000389 On entry, tok->decoding_buffer will be one of:
390 1) NULL: need to call tok->decoding_readline to get a new line
391 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000393 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 (in the s buffer) to copy entire contents of the line read
395 by tok->decoding_readline. tok->decoding_buffer has the overflow.
396 In this case, fp_readl is called in a loop (with an expanded buffer)
397 until the buffer ends with a '\n' (or until the end of the file is
398 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000399*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400
401static char *
402fp_readl(char *s, int size, struct tok_state *tok)
403{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyObject* bufobj;
405 const char *buf;
406 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 /* Ask for one less byte so we can terminate it */
409 assert(size > 0);
410 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000412 if (tok->decoding_buffer) {
413 bufobj = tok->decoding_buffer;
414 Py_INCREF(bufobj);
415 }
416 else
417 {
418 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
419 if (bufobj == NULL)
420 goto error;
421 }
422 if (PyUnicode_CheckExact(bufobj))
423 {
424 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
425 if (buf == NULL) {
426 goto error;
427 }
428 }
429 else
430 {
431 buf = PyByteArray_AsString(bufobj);
432 if (buf == NULL) {
433 goto error;
434 }
435 buflen = PyByteArray_GET_SIZE(bufobj);
436 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438 Py_XDECREF(tok->decoding_buffer);
439 if (buflen > size) {
440 /* Too many chars, the rest goes into tok->decoding_buffer */
441 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
442 buflen-size);
443 if (tok->decoding_buffer == NULL)
444 goto error;
445 buflen = size;
446 }
447 else
448 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000449
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000450 memcpy(s, buf, buflen);
451 s[buflen] = '\0';
452 if (buflen == 0) /* EOF */
453 s = NULL;
454 Py_DECREF(bufobj);
455 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000456
457error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 Py_XDECREF(bufobj);
459 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460}
461
462/* Set the readline function for TOK to a StreamReader's
463 readline function. The StreamReader is named ENC.
464
465 This function is called from check_bom and check_coding_spec.
466
467 ENC is usually identical to the future value of tok->encoding,
468 except for the (currently unsupported) case of UTF-16.
469
470 Return 1 on success, 0 on failure. */
471
472static int
473fp_setreadl(struct tok_state *tok, const char* enc)
474{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200476 _Py_IDENTIFIER(open);
477 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000478 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 io = PyImport_ImportModuleNoBlock("io");
481 if (io == NULL)
482 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000483
Victor Stinner22a351a2010-10-14 12:04:34 +0000484 fd = fileno(tok->fp);
485 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
486 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
487 goto cleanup;
488 }
489
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200490 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000491 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 if (stream == NULL)
493 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200496 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 /* The file has been reopened; parsing will restart from
500 * the beginning of the file, we have to reset the line number.
501 * But this function has been called from inside tok_nextc() which
502 * will increment lineno before it returns. So we set it -1 so that
503 * the next call to tok_nextc() will start with tok->lineno == 0.
504 */
505 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000506
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000507 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 Py_XDECREF(stream);
509 Py_XDECREF(io);
510 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000511}
512
513/* Fetch the next byte from TOK. */
514
515static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000516 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517}
518
519/* Unfetch the last byte back into TOK. */
520
521static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523}
524
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes in the sequence (1 to 4) if so, 0 if not.

   Only the lead-byte class and the 10xxxxxx form of each continuation
   byte are checked.  S must be NUL-terminated: continuation bytes are
   examined front to back, so a sequence cut short by the terminator is
   rejected without reading anything beyond the NUL. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;       /* continuation bytes required after the lead */
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return trailing + 1;
}
552
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553/* Read a line of input from TOK. Determine encoding
554 if necessary. */
555
556static char *
557decoding_fgets(char *s, int size, struct tok_state *tok)
558{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 char *line = NULL;
560 int badchar = 0;
561 for (;;) {
562 if (tok->decoding_state == STATE_NORMAL) {
563 /* We already have a codec associated with
564 this input. */
565 line = fp_readl(s, size, tok);
566 break;
567 } else if (tok->decoding_state == STATE_RAW) {
568 /* We want a 'raw' read. */
569 line = Py_UniversalNewlineFgets(s, size,
570 tok->fp, NULL);
571 break;
572 } else {
573 /* We have not yet determined the encoding.
574 If an encoding is found, use the file-pointer
575 reader functions from now on. */
576 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
577 return error_ret(tok);
578 assert(tok->decoding_state != STATE_INIT);
579 }
580 }
581 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
582 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
583 return error_ret(tok);
584 }
585 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 /* The default encoding is UTF-8, so make sure we don't have any
588 non-UTF-8 sequences in it. */
589 if (line && !tok->encoding) {
590 unsigned char *c;
591 int length;
592 for (c = (unsigned char *)line; *c; c += length)
593 if (!(length = valid_utf8(c))) {
594 badchar = *c;
595 break;
596 }
597 }
598 if (badchar) {
599 /* Need to add 1 to the line number, since this line
600 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200601 PyErr_Format(PyExc_SyntaxError,
602 "Non-UTF-8 code starting with '\\x%.2x' "
603 "in file %U on line %i, "
604 "but no encoding declared; "
605 "see http://python.org/dev/peps/pep-0263/ for details",
606 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 return error_ret(tok);
608 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611}
612
613static int
614decoding_feof(struct tok_state *tok)
615{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 if (tok->decoding_state != STATE_NORMAL) {
617 return feof(tok->fp);
618 } else {
619 PyObject* buf = tok->decoding_buffer;
620 if (buf == NULL) {
621 buf = PyObject_CallObject(tok->decoding_readline, NULL);
622 if (buf == NULL) {
623 error_ret(tok);
624 return 1;
625 } else {
626 tok->decoding_buffer = buf;
627 }
628 }
629 return PyObject_Length(buf) == 0;
630 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633/* Fetch a byte from TOK, using the string buffer. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638}
639
640/* Unfetch a byte from TOK, using the string buffer. */
641
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000642static void
643buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 tok->str--;
645 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646}
647
648/* Set the readline function for TOK to ENC. For the string-based
649 tokenizer, this means to just record the encoding. */
650
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000651static int
652buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 tok->enc = enc;
654 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655}
656
657/* Return a UTF-8 encoding Python string object from the
658 C byte string STR, which is encoded with ENC. */
659
660static PyObject *
661translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662 PyObject *utf8;
663 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
664 if (buf == NULL)
665 return NULL;
666 utf8 = PyUnicode_AsUTF8String(buf);
667 Py_DECREF(buf);
668 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669}
670
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000671
672static char *
673translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
675 char *buf, *current;
676 char c = '\0';
677 buf = PyMem_MALLOC(needed_length);
678 if (buf == NULL) {
679 tok->done = E_NOMEM;
680 return NULL;
681 }
682 for (current = buf; *s; s++, current++) {
683 c = *s;
684 if (skip_next_lf) {
685 skip_next_lf = 0;
686 if (c == '\n') {
687 c = *++s;
688 if (!c)
689 break;
690 }
691 }
692 if (c == '\r') {
693 skip_next_lf = 1;
694 c = '\n';
695 }
696 *current = c;
697 }
698 /* If this is exec input, add a newline to the end of the string if
699 there isn't one already. */
700 if (exec_input && c != '\n') {
701 *current = '\n';
702 current++;
703 }
704 *current = '\0';
705 final_length = current - buf + 1;
706 if (final_length < needed_length && final_length)
707 /* should never fail */
708 buf = PyMem_REALLOC(buf, final_length);
709 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000710}
711
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000712/* Decode a byte string STR for use as the buffer of TOK.
713 Look for encoding declarations inside STR, and record them
714 inside TOK. */
715
716static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000717decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000718{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 PyObject* utf8 = NULL;
720 const char *str;
721 const char *s;
722 const char *newl[2] = {NULL, NULL};
723 int lineno = 0;
724 tok->input = str = translate_newlines(input, single, tok);
725 if (str == NULL)
726 return NULL;
727 tok->enc = NULL;
728 tok->str = str;
729 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
730 return error_ret(tok);
731 str = tok->str; /* string after BOM if any */
732 assert(str);
733 if (tok->enc != NULL) {
734 utf8 = translate_into_utf8(str, tok->enc);
735 if (utf8 == NULL)
736 return error_ret(tok);
737 str = PyBytes_AsString(utf8);
738 }
739 for (s = str;; s++) {
740 if (*s == '\0') break;
741 else if (*s == '\n') {
742 assert(lineno < 2);
743 newl[lineno] = s;
744 lineno++;
745 if (lineno == 2) break;
746 }
747 }
748 tok->enc = NULL;
749 /* need to check line 1 and 2 separately since check_coding_spec
750 assumes a single line as input */
751 if (newl[0]) {
752 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
753 return error_ret(tok);
754 if (tok->enc == NULL && newl[1]) {
755 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
756 tok, buf_setreadl))
757 return error_ret(tok);
758 }
759 }
760 if (tok->enc != NULL) {
761 assert(utf8 == NULL);
762 utf8 = translate_into_utf8(str, tok->enc);
763 if (utf8 == NULL)
764 return error_ret(tok);
765 str = PyBytes_AS_STRING(utf8);
766 }
767 assert(tok->decoding_buffer == NULL);
768 tok->decoding_buffer = utf8; /* CAUTION */
769 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000770}
771
772#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773
774/* Set up tokenizer for string */
775
776struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000777PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 struct tok_state *tok = tok_new();
780 if (tok == NULL)
781 return NULL;
782 str = (char *)decode_str(str, exec_input, tok);
783 if (str == NULL) {
784 PyTokenizer_Free(tok);
785 return NULL;
786 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000787
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 /* XXX: constify members. */
789 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
790 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791}
792
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000793struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000794PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000795{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 struct tok_state *tok = tok_new();
797 if (tok == NULL)
798 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000799#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000801#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 if (str == NULL) {
803 PyTokenizer_Free(tok);
804 return NULL;
805 }
806 tok->decoding_state = STATE_RAW;
807 tok->read_coding_spec = 1;
808 tok->enc = NULL;
809 tok->str = str;
810 tok->encoding = (char *)PyMem_MALLOC(6);
811 if (!tok->encoding) {
812 PyTokenizer_Free(tok);
813 return NULL;
814 }
815 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000816
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 /* XXX: constify members. */
818 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
819 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000820}
821
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000822/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823
824struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000825PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 struct tok_state *tok = tok_new();
828 if (tok == NULL)
829 return NULL;
830 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
831 PyTokenizer_Free(tok);
832 return NULL;
833 }
834 tok->cur = tok->inp = tok->buf;
835 tok->end = tok->buf + BUFSIZ;
836 tok->fp = fp;
837 tok->prompt = ps1;
838 tok->nextprompt = ps2;
839 if (enc != NULL) {
840 /* Must copy encoding declaration since it
841 gets copied into the parse tree. */
842 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
843 if (!tok->encoding) {
844 PyTokenizer_Free(tok);
845 return NULL;
846 }
847 strcpy(tok->encoding, enc);
848 tok->decoding_state = STATE_NORMAL;
849 }
850 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000851}
852
853
854/* Free a tok_state structure */
855
856void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000857PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000859 if (tok->encoding != NULL)
860 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000861#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000862 Py_XDECREF(tok->decoding_readline);
863 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200864 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000865#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000866 if (tok->fp != NULL && tok->buf != NULL)
867 PyMem_FREE(tok->buf);
868 if (tok->input)
869 PyMem_FREE((char *)tok->input);
870 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000871}
872
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873/* Get next char, updating state; error code goes into tok->done */
874
875static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000876tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000877{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000878 for (;;) {
879 if (tok->cur != tok->inp) {
880 return Py_CHARMASK(*tok->cur++); /* Fast path */
881 }
882 if (tok->done != E_OK)
883 return EOF;
884 if (tok->fp == NULL) {
885 char *end = strchr(tok->inp, '\n');
886 if (end != NULL)
887 end++;
888 else {
889 end = strchr(tok->inp, '\0');
890 if (end == tok->inp) {
891 tok->done = E_EOF;
892 return EOF;
893 }
894 }
895 if (tok->start == NULL)
896 tok->buf = tok->cur;
897 tok->line_start = tok->cur;
898 tok->lineno++;
899 tok->inp = end;
900 return Py_CHARMASK(*tok->cur++);
901 }
902 if (tok->prompt != NULL) {
903 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000904#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000905 if (newtok != NULL) {
906 char *translated = translate_newlines(newtok, 0, tok);
907 PyMem_FREE(newtok);
908 if (translated == NULL)
909 return EOF;
910 newtok = translated;
911 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000912 if (tok->encoding && newtok && *newtok) {
913 /* Recode to UTF-8 */
914 Py_ssize_t buflen;
915 const char* buf;
916 PyObject *u = translate_into_utf8(newtok, tok->encoding);
917 PyMem_FREE(newtok);
918 if (!u) {
919 tok->done = E_DECODE;
920 return EOF;
921 }
922 buflen = PyBytes_GET_SIZE(u);
923 buf = PyBytes_AS_STRING(u);
924 if (!buf) {
925 Py_DECREF(u);
926 tok->done = E_DECODE;
927 return EOF;
928 }
929 newtok = PyMem_MALLOC(buflen+1);
930 strcpy(newtok, buf);
931 Py_DECREF(u);
932 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000933#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000934 if (tok->nextprompt != NULL)
935 tok->prompt = tok->nextprompt;
936 if (newtok == NULL)
937 tok->done = E_INTR;
938 else if (*newtok == '\0') {
939 PyMem_FREE(newtok);
940 tok->done = E_EOF;
941 }
942 else if (tok->start != NULL) {
943 size_t start = tok->start - tok->buf;
944 size_t oldlen = tok->cur - tok->buf;
945 size_t newlen = oldlen + strlen(newtok);
946 char *buf = tok->buf;
947 buf = (char *)PyMem_REALLOC(buf, newlen+1);
948 tok->lineno++;
949 if (buf == NULL) {
950 PyMem_FREE(tok->buf);
951 tok->buf = NULL;
952 PyMem_FREE(newtok);
953 tok->done = E_NOMEM;
954 return EOF;
955 }
956 tok->buf = buf;
957 tok->cur = tok->buf + oldlen;
958 tok->line_start = tok->cur;
959 strcpy(tok->buf + oldlen, newtok);
960 PyMem_FREE(newtok);
961 tok->inp = tok->buf + newlen;
962 tok->end = tok->inp + 1;
963 tok->start = tok->buf + start;
964 }
965 else {
966 tok->lineno++;
967 if (tok->buf != NULL)
968 PyMem_FREE(tok->buf);
969 tok->buf = newtok;
970 tok->line_start = tok->buf;
971 tok->cur = tok->buf;
972 tok->line_start = tok->buf;
973 tok->inp = strchr(tok->buf, '\0');
974 tok->end = tok->inp + 1;
975 }
976 }
977 else {
978 int done = 0;
979 Py_ssize_t cur = 0;
980 char *pt;
981 if (tok->start == NULL) {
982 if (tok->buf == NULL) {
983 tok->buf = (char *)
984 PyMem_MALLOC(BUFSIZ);
985 if (tok->buf == NULL) {
986 tok->done = E_NOMEM;
987 return EOF;
988 }
989 tok->end = tok->buf + BUFSIZ;
990 }
991 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
992 tok) == NULL) {
993 tok->done = E_EOF;
994 done = 1;
995 }
996 else {
997 tok->done = E_OK;
998 tok->inp = strchr(tok->buf, '\0');
999 done = tok->inp[-1] == '\n';
1000 }
1001 }
1002 else {
1003 cur = tok->cur - tok->buf;
1004 if (decoding_feof(tok)) {
1005 tok->done = E_EOF;
1006 done = 1;
1007 }
1008 else
1009 tok->done = E_OK;
1010 }
1011 tok->lineno++;
1012 /* Read until '\n' or EOF */
1013 while (!done) {
1014 Py_ssize_t curstart = tok->start == NULL ? -1 :
1015 tok->start - tok->buf;
1016 Py_ssize_t curvalid = tok->inp - tok->buf;
1017 Py_ssize_t newsize = curvalid + BUFSIZ;
1018 char *newbuf = tok->buf;
1019 newbuf = (char *)PyMem_REALLOC(newbuf,
1020 newsize);
1021 if (newbuf == NULL) {
1022 tok->done = E_NOMEM;
1023 tok->cur = tok->inp;
1024 return EOF;
1025 }
1026 tok->buf = newbuf;
1027 tok->inp = tok->buf + curvalid;
1028 tok->end = tok->buf + newsize;
1029 tok->start = curstart < 0 ? NULL :
1030 tok->buf + curstart;
1031 if (decoding_fgets(tok->inp,
1032 (int)(tok->end - tok->inp),
1033 tok) == NULL) {
1034 /* Break out early on decoding
1035 errors, as tok->buf will be NULL
1036 */
1037 if (tok->decoding_erred)
1038 return EOF;
1039 /* Last line does not end in \n,
1040 fake one */
1041 strcpy(tok->inp, "\n");
1042 }
1043 tok->inp = strchr(tok->inp, '\0');
1044 done = tok->inp[-1] == '\n';
1045 }
1046 if (tok->buf != NULL) {
1047 tok->cur = tok->buf + cur;
1048 tok->line_start = tok->cur;
1049 /* replace "\r\n" with "\n" */
1050 /* For Mac leave the \r, giving a syntax error */
1051 pt = tok->inp - 2;
1052 if (pt >= tok->buf && *pt == '\r') {
1053 *pt++ = '\n';
1054 *pt = '\0';
1055 tok->inp = pt;
1056 }
1057 }
1058 }
1059 if (tok->done != E_OK) {
1060 if (tok->prompt != NULL)
1061 PySys_WriteStderr("\n");
1062 tok->cur = tok->inp;
1063 return EOF;
1064 }
1065 }
1066 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001067}
1068
1069
1070/* Back-up one character */
1071
1072static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001073tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001075 if (c != EOF) {
1076 if (--tok->cur < tok->buf)
1077 Py_FatalError("tok_backup: beginning of buffer");
1078 if (*tok->cur != c)
1079 *tok->cur = c;
1080 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001081}
1082
1083
1084/* Return the token corresponding to a single character */
1085
1086int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001087PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001088{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 switch (c) {
1090 case '(': return LPAR;
1091 case ')': return RPAR;
1092 case '[': return LSQB;
1093 case ']': return RSQB;
1094 case ':': return COLON;
1095 case ',': return COMMA;
1096 case ';': return SEMI;
1097 case '+': return PLUS;
1098 case '-': return MINUS;
1099 case '*': return STAR;
1100 case '/': return SLASH;
1101 case '|': return VBAR;
1102 case '&': return AMPER;
1103 case '<': return LESS;
1104 case '>': return GREATER;
1105 case '=': return EQUAL;
1106 case '.': return DOT;
1107 case '%': return PERCENT;
1108 case '{': return LBRACE;
1109 case '}': return RBRACE;
1110 case '^': return CIRCUMFLEX;
1111 case '~': return TILDE;
1112 case '@': return AT;
1113 default: return OP;
1114 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001115}
1116
1117
Guido van Rossumfbab9051991-10-20 20:25:03 +00001118int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001119PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001120{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 switch (c1) {
1122 case '=':
1123 switch (c2) {
1124 case '=': return EQEQUAL;
1125 }
1126 break;
1127 case '!':
1128 switch (c2) {
1129 case '=': return NOTEQUAL;
1130 }
1131 break;
1132 case '<':
1133 switch (c2) {
1134 case '>': return NOTEQUAL;
1135 case '=': return LESSEQUAL;
1136 case '<': return LEFTSHIFT;
1137 }
1138 break;
1139 case '>':
1140 switch (c2) {
1141 case '=': return GREATEREQUAL;
1142 case '>': return RIGHTSHIFT;
1143 }
1144 break;
1145 case '+':
1146 switch (c2) {
1147 case '=': return PLUSEQUAL;
1148 }
1149 break;
1150 case '-':
1151 switch (c2) {
1152 case '=': return MINEQUAL;
1153 case '>': return RARROW;
1154 }
1155 break;
1156 case '*':
1157 switch (c2) {
1158 case '*': return DOUBLESTAR;
1159 case '=': return STAREQUAL;
1160 }
1161 break;
1162 case '/':
1163 switch (c2) {
1164 case '/': return DOUBLESLASH;
1165 case '=': return SLASHEQUAL;
1166 }
1167 break;
1168 case '|':
1169 switch (c2) {
1170 case '=': return VBAREQUAL;
1171 }
1172 break;
1173 case '%':
1174 switch (c2) {
1175 case '=': return PERCENTEQUAL;
1176 }
1177 break;
1178 case '&':
1179 switch (c2) {
1180 case '=': return AMPEREQUAL;
1181 }
1182 break;
1183 case '^':
1184 switch (c2) {
1185 case '=': return CIRCUMFLEXEQUAL;
1186 }
1187 break;
1188 }
1189 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001190}
1191
Thomas Wouters434d0822000-08-24 20:11:32 +00001192int
1193PyToken_ThreeChars(int c1, int c2, int c3)
1194{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 switch (c1) {
1196 case '<':
1197 switch (c2) {
1198 case '<':
1199 switch (c3) {
1200 case '=':
1201 return LEFTSHIFTEQUAL;
1202 }
1203 break;
1204 }
1205 break;
1206 case '>':
1207 switch (c2) {
1208 case '>':
1209 switch (c3) {
1210 case '=':
1211 return RIGHTSHIFTEQUAL;
1212 }
1213 break;
1214 }
1215 break;
1216 case '*':
1217 switch (c2) {
1218 case '*':
1219 switch (c3) {
1220 case '=':
1221 return DOUBLESTAREQUAL;
1222 }
1223 break;
1224 }
1225 break;
1226 case '/':
1227 switch (c2) {
1228 case '/':
1229 switch (c3) {
1230 case '=':
1231 return DOUBLESLASHEQUAL;
1232 }
1233 break;
1234 }
1235 break;
1236 case '.':
1237 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001238 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 switch (c3) {
1240 case '.':
1241 return ELLIPSIS;
1242 }
1243 break;
1244 }
1245 break;
1246 }
1247 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001248}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001249
Guido van Rossum926f13a1998-04-09 21:38:06 +00001250static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001251indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001252{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 if (tok->alterror) {
1254 tok->done = E_TABSPACE;
1255 tok->cur = tok->inp;
1256 return 1;
1257 }
1258 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001259#ifdef PGEN
1260 PySys_WriteStderr("inconsistent use of tabs and spaces "
1261 "in indentation\n");
1262#else
1263 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001265#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 tok->altwarning = 0;
1267 }
1268 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001269}
1270
Martin v. Löwis47383402007-08-15 07:32:56 +00001271#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001272#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001273#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001274/* Verify that the identifier follows PEP 3131.
1275 All identifier strings are guaranteed to be "ready" unicode objects.
1276 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001277static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001278verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001279{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 PyObject *s;
1281 int result;
1282 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1285 PyErr_Clear();
1286 tok->done = E_IDENTIFIER;
1287 } else {
1288 tok->done = E_ERROR;
1289 }
1290 return 0;
1291 }
1292 result = PyUnicode_IsIdentifier(s);
1293 Py_DECREF(s);
1294 if (result == 0)
1295 tok->done = E_IDENTIFIER;
1296 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001297}
1298#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001299
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300/* Get next token, after space stripping etc. */
1301
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001302static int
1303tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 register int c;
1306 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001307
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001309 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 tok->start = NULL;
1311 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001312
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 /* Get indentation level */
1314 if (tok->atbol) {
1315 register int col = 0;
1316 register int altcol = 0;
1317 tok->atbol = 0;
1318 for (;;) {
1319 c = tok_nextc(tok);
1320 if (c == ' ')
1321 col++, altcol++;
1322 else if (c == '\t') {
1323 col = (col/tok->tabsize + 1) * tok->tabsize;
1324 altcol = (altcol/tok->alttabsize + 1)
1325 * tok->alttabsize;
1326 }
1327 else if (c == '\014') /* Control-L (formfeed) */
1328 col = altcol = 0; /* For Emacs users */
1329 else
1330 break;
1331 }
1332 tok_backup(tok, c);
1333 if (c == '#' || c == '\n') {
1334 /* Lines with only whitespace and/or comments
1335 shouldn't affect the indentation and are
1336 not passed to the parser as NEWLINE tokens,
1337 except *totally* empty lines in interactive
1338 mode, which signal the end of a command group. */
1339 if (col == 0 && c == '\n' && tok->prompt != NULL)
1340 blankline = 0; /* Let it through */
1341 else
1342 blankline = 1; /* Ignore completely */
1343 /* We can't jump back right here since we still
1344 may need to skip to the end of a comment */
1345 }
1346 if (!blankline && tok->level == 0) {
1347 if (col == tok->indstack[tok->indent]) {
1348 /* No change */
1349 if (altcol != tok->altindstack[tok->indent]) {
1350 if (indenterror(tok))
1351 return ERRORTOKEN;
1352 }
1353 }
1354 else if (col > tok->indstack[tok->indent]) {
1355 /* Indent -- always one */
1356 if (tok->indent+1 >= MAXINDENT) {
1357 tok->done = E_TOODEEP;
1358 tok->cur = tok->inp;
1359 return ERRORTOKEN;
1360 }
1361 if (altcol <= tok->altindstack[tok->indent]) {
1362 if (indenterror(tok))
1363 return ERRORTOKEN;
1364 }
1365 tok->pendin++;
1366 tok->indstack[++tok->indent] = col;
1367 tok->altindstack[tok->indent] = altcol;
1368 }
1369 else /* col < tok->indstack[tok->indent] */ {
1370 /* Dedent -- any number, must be consistent */
1371 while (tok->indent > 0 &&
1372 col < tok->indstack[tok->indent]) {
1373 tok->pendin--;
1374 tok->indent--;
1375 }
1376 if (col != tok->indstack[tok->indent]) {
1377 tok->done = E_DEDENT;
1378 tok->cur = tok->inp;
1379 return ERRORTOKEN;
1380 }
1381 if (altcol != tok->altindstack[tok->indent]) {
1382 if (indenterror(tok))
1383 return ERRORTOKEN;
1384 }
1385 }
1386 }
1387 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001388
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 /* Return pending indents/dedents */
1392 if (tok->pendin != 0) {
1393 if (tok->pendin < 0) {
1394 tok->pendin++;
1395 return DEDENT;
1396 }
1397 else {
1398 tok->pendin--;
1399 return INDENT;
1400 }
1401 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 tok->start = NULL;
1405 /* Skip spaces */
1406 do {
1407 c = tok_nextc(tok);
1408 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001409
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 /* Set start of current token */
1411 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 /* Skip comment */
1414 if (c == '#')
1415 while (c != EOF && c != '\n')
1416 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001417
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 /* Check for EOF and errors now */
1419 if (c == EOF) {
1420 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1421 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 /* Identifier (most frequent token!) */
1424 nonascii = 0;
1425 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001426 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001427 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001428 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001429 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001430 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001431 /* Since this is a backwards compatibility support literal we don't
1432 want to support it in arbitrary order like byte literals. */
1433 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1434 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001435 /* ur"" and ru"" are not supported */
1436 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001437 saw_r = 1;
1438 else
1439 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 c = tok_nextc(tok);
1441 if (c == '"' || c == '\'')
1442 goto letter_quote;
1443 }
1444 while (is_potential_identifier_char(c)) {
1445 if (c >= 128)
1446 nonascii = 1;
1447 c = tok_nextc(tok);
1448 }
1449 tok_backup(tok, c);
1450 if (nonascii &&
1451 !verify_identifier(tok)) {
1452 tok->done = E_IDENTIFIER;
1453 return ERRORTOKEN;
1454 }
1455 *p_start = tok->start;
1456 *p_end = tok->cur;
1457 return NAME;
1458 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 /* Newline */
1461 if (c == '\n') {
1462 tok->atbol = 1;
1463 if (blankline || tok->level > 0)
1464 goto nextline;
1465 *p_start = tok->start;
1466 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1467 tok->cont_line = 0;
1468 return NEWLINE;
1469 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001470
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 /* Period or number starting with period? */
1472 if (c == '.') {
1473 c = tok_nextc(tok);
1474 if (isdigit(c)) {
1475 goto fraction;
1476 } else if (c == '.') {
1477 c = tok_nextc(tok);
1478 if (c == '.') {
1479 *p_start = tok->start;
1480 *p_end = tok->cur;
1481 return ELLIPSIS;
1482 } else {
1483 tok_backup(tok, c);
1484 }
1485 tok_backup(tok, '.');
1486 } else {
1487 tok_backup(tok, c);
1488 }
1489 *p_start = tok->start;
1490 *p_end = tok->cur;
1491 return DOT;
1492 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001493
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001494 /* Number */
1495 if (isdigit(c)) {
1496 if (c == '0') {
1497 /* Hex, octal or binary -- maybe. */
1498 c = tok_nextc(tok);
1499 if (c == '.')
1500 goto fraction;
1501 if (c == 'j' || c == 'J')
1502 goto imaginary;
1503 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001504
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001505 /* Hex */
1506 c = tok_nextc(tok);
1507 if (!isxdigit(c)) {
1508 tok->done = E_TOKEN;
1509 tok_backup(tok, c);
1510 return ERRORTOKEN;
1511 }
1512 do {
1513 c = tok_nextc(tok);
1514 } while (isxdigit(c));
1515 }
1516 else if (c == 'o' || c == 'O') {
1517 /* Octal */
1518 c = tok_nextc(tok);
1519 if (c < '0' || c >= '8') {
1520 tok->done = E_TOKEN;
1521 tok_backup(tok, c);
1522 return ERRORTOKEN;
1523 }
1524 do {
1525 c = tok_nextc(tok);
1526 } while ('0' <= c && c < '8');
1527 }
1528 else if (c == 'b' || c == 'B') {
1529 /* Binary */
1530 c = tok_nextc(tok);
1531 if (c != '0' && c != '1') {
1532 tok->done = E_TOKEN;
1533 tok_backup(tok, c);
1534 return ERRORTOKEN;
1535 }
1536 do {
1537 c = tok_nextc(tok);
1538 } while (c == '0' || c == '1');
1539 }
1540 else {
1541 int nonzero = 0;
1542 /* maybe old-style octal; c is first char of it */
1543 /* in any case, allow '0' as a literal */
1544 while (c == '0')
1545 c = tok_nextc(tok);
1546 while (isdigit(c)) {
1547 nonzero = 1;
1548 c = tok_nextc(tok);
1549 }
1550 if (c == '.')
1551 goto fraction;
1552 else if (c == 'e' || c == 'E')
1553 goto exponent;
1554 else if (c == 'j' || c == 'J')
1555 goto imaginary;
1556 else if (nonzero) {
1557 tok->done = E_TOKEN;
1558 tok_backup(tok, c);
1559 return ERRORTOKEN;
1560 }
1561 }
1562 }
1563 else {
1564 /* Decimal */
1565 do {
1566 c = tok_nextc(tok);
1567 } while (isdigit(c));
1568 {
1569 /* Accept floating point numbers. */
1570 if (c == '.') {
1571 fraction:
1572 /* Fraction */
1573 do {
1574 c = tok_nextc(tok);
1575 } while (isdigit(c));
1576 }
1577 if (c == 'e' || c == 'E') {
1578 exponent:
1579 /* Exponent part */
1580 c = tok_nextc(tok);
1581 if (c == '+' || c == '-')
1582 c = tok_nextc(tok);
1583 if (!isdigit(c)) {
1584 tok->done = E_TOKEN;
1585 tok_backup(tok, c);
1586 return ERRORTOKEN;
1587 }
1588 do {
1589 c = tok_nextc(tok);
1590 } while (isdigit(c));
1591 }
1592 if (c == 'j' || c == 'J')
1593 /* Imaginary part */
1594 imaginary:
1595 c = tok_nextc(tok);
1596 }
1597 }
1598 tok_backup(tok, c);
1599 *p_start = tok->start;
1600 *p_end = tok->cur;
1601 return NUMBER;
1602 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001603
1604 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001605 /* String */
1606 if (c == '\'' || c == '"') {
1607 int quote = c;
1608 int quote_size = 1; /* 1 or 3 */
1609 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001610
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611 /* Find the quote size and start of string */
1612 c = tok_nextc(tok);
1613 if (c == quote) {
1614 c = tok_nextc(tok);
1615 if (c == quote)
1616 quote_size = 3;
1617 else
1618 end_quote_size = 1; /* empty string found */
1619 }
1620 if (c != quote)
1621 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001622
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 /* Get rest of string */
1624 while (end_quote_size != quote_size) {
1625 c = tok_nextc(tok);
1626 if (c == EOF) {
1627 if (quote_size == 3)
1628 tok->done = E_EOFS;
1629 else
1630 tok->done = E_EOLS;
1631 tok->cur = tok->inp;
1632 return ERRORTOKEN;
1633 }
1634 if (quote_size == 1 && c == '\n') {
1635 tok->done = E_EOLS;
1636 tok->cur = tok->inp;
1637 return ERRORTOKEN;
1638 }
1639 if (c == quote)
1640 end_quote_size += 1;
1641 else {
1642 end_quote_size = 0;
1643 if (c == '\\')
1644 c = tok_nextc(tok); /* skip escaped char */
1645 }
1646 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001647
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 *p_start = tok->start;
1649 *p_end = tok->cur;
1650 return STRING;
1651 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001652
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 /* Line continuation */
1654 if (c == '\\') {
1655 c = tok_nextc(tok);
1656 if (c != '\n') {
1657 tok->done = E_LINECONT;
1658 tok->cur = tok->inp;
1659 return ERRORTOKEN;
1660 }
1661 tok->cont_line = 1;
1662 goto again; /* Read next line */
1663 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001664
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 /* Check for two-character token */
1666 {
1667 int c2 = tok_nextc(tok);
1668 int token = PyToken_TwoChars(c, c2);
1669 if (token != OP) {
1670 int c3 = tok_nextc(tok);
1671 int token3 = PyToken_ThreeChars(c, c2, c3);
1672 if (token3 != OP) {
1673 token = token3;
1674 } else {
1675 tok_backup(tok, c3);
1676 }
1677 *p_start = tok->start;
1678 *p_end = tok->cur;
1679 return token;
1680 }
1681 tok_backup(tok, c2);
1682 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001683
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 /* Keep track of parentheses nesting level */
1685 switch (c) {
1686 case '(':
1687 case '[':
1688 case '{':
1689 tok->level++;
1690 break;
1691 case ')':
1692 case ']':
1693 case '}':
1694 tok->level--;
1695 break;
1696 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001697
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 /* Punctuation character */
1699 *p_start = tok->start;
1700 *p_end = tok->cur;
1701 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001702}
1703
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001704int
1705PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1706{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 int result = tok_get(tok, p_start, p_end);
1708 if (tok->decoding_erred) {
1709 result = ERRORTOKEN;
1710 tok->done = E_DECODE;
1711 }
1712 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001713}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001714
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001715/* Get the encoding of a Python file. Check for the coding cookie and check if
1716 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001717
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001718 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1719 encoding in the first or second line of the file (in which case the encoding
1720 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001721
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001722 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1723 by the caller. */
1724
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001725char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001726PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001727{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 struct tok_state *tok;
1729 FILE *fp;
1730 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001731
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 fd = dup(fd);
1733 if (fd < 0) {
1734 return NULL;
1735 }
1736 fp = fdopen(fd, "r");
1737 if (fp == NULL) {
1738 return NULL;
1739 }
1740 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1741 if (tok == NULL) {
1742 fclose(fp);
1743 return NULL;
1744 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001745#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001746 if (filename != NULL) {
1747 Py_INCREF(filename);
1748 tok->filename = filename;
1749 }
1750 else {
1751 tok->filename = PyUnicode_FromString("<string>");
1752 if (tok->filename == NULL) {
1753 fclose(fp);
1754 PyTokenizer_Free(tok);
1755 return encoding;
1756 }
1757 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001758#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 while (tok->lineno < 2 && tok->done == E_OK) {
1760 PyTokenizer_Get(tok, &p_start, &p_end);
1761 }
1762 fclose(fp);
1763 if (tok->encoding) {
1764 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1765 if (encoding)
1766 strcpy(encoding, tok->encoding);
1767 }
1768 PyTokenizer_Free(tok);
1769 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001770}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001771
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001772char *
1773PyTokenizer_FindEncoding(int fd)
1774{
1775 return PyTokenizer_FindEncodingFilename(fd, NULL);
1776}
1777
#ifdef Py_DEBUG

/* Debug helper: print the name of token `type` to stdout, followed by the
   token text [start, end) in parentheses for NAME, NUMBER, STRING and OP
   tokens. */
void
tok_dump(int type, char *start, char *end)
{
    int text_len;

    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        text_len = (int)(end - start);
        printf("(%.*s)", text_len, start);
        break;
    default:
        break;
    }
}

#endif