blob: ef7b19fb42f61209a2a92fc26c2a6491cb1dc386 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  The argument is fully parenthesized so that an
   expression such as `x & 0xff` is classified correctly, but it is still
   evaluated several times and therefore must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  The argument is fully parenthesized
   so that an expression such as `x & 0xff` is classified correctly, but it
   is still evaluated several times and must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
/* Tab size stored in tok->tabsize when a tok_state is created.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparison, bitwise and shift operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    /* remaining operators */
    "AT", "ATEQUAL", "RARROW", "ELLIPSIS",

    /* generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
109
110
111/* Create and initialize a new tok_state structure */
112
113static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000114tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000115{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
117 sizeof(struct tok_state));
118 if (tok == NULL)
119 return NULL;
120 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
121 tok->done = E_OK;
122 tok->fp = NULL;
123 tok->input = NULL;
124 tok->tabsize = TABSIZE;
125 tok->indent = 0;
126 tok->indstack[0] = 0;
127 tok->atbol = 1;
128 tok->pendin = 0;
129 tok->prompt = tok->nextprompt = NULL;
130 tok->lineno = 0;
131 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
136 tok->decoding_state = STATE_INIT;
137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
139 tok->enc = NULL;
140 tok->encoding = NULL;
141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200143 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000144 tok->decoding_readline = NULL;
145 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000146#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000148}
149
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000150static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000152{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700154 if (!result) {
155 tok->done = E_NOMEM;
156 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000157 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700158 memcpy(result, s, len);
159 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000161}
162
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000163#ifdef PGEN
164
165static char *
166decoding_fgets(char *s, int size, struct tok_state *tok)
167{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169}
170
171static int
172decoding_feof(struct tok_state *tok)
173{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175}
176
/* PGEN build: no decoding is done; return an allocated copy of STR
   (NULL, with tok->done set by new_string, if the allocation fails).
   EXEC_INPUT is not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
182
183#else /* PGEN */
184
185static char *
186error_ret(struct tok_state *tok) /* XXX */
187{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000188 tok->decoding_erred = 1;
189 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
190 PyMem_FREE(tok->buf);
191 tok->buf = NULL;
192 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000193}
194
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000195
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are examined, after folding ASCII
   upper case to lower case and '_' to '-'.  Returns the string literal
   "utf-8" or "iso-8859-1" when S is (or starts with, followed by '-') one
   of the recognized aliases; otherwise returns S itself, unchanged.
   Callers compare the result against S to find out whether a canonical
   name was substituted.

   Case folding is done by hand on unsigned char values: tolower() has
   undefined behaviour for negative char values and depends on the
   current locale, neither of which is wanted for encoding names. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = (char)c;
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
224
225/* Return the coding spec in S, or NULL if none is found. */
226
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700227static int
228get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000229{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700231 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 /* Coding spec must be in a comment, and that comment must be
233 * the only statement on the source code line. */
234 for (i = 0; i < size - 6; i++) {
235 if (s[i] == '#')
236 break;
237 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700238 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 }
240 for (; i < size - 6; i++) { /* XXX inefficient search */
241 const char* t = s + i;
242 if (strncmp(t, "coding", 6) == 0) {
243 const char* begin = NULL;
244 t += 6;
245 if (t[0] != ':' && t[0] != '=')
246 continue;
247 do {
248 t++;
249 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 begin = t;
252 while (Py_ISALNUM(t[0]) ||
253 t[0] == '-' || t[0] == '_' || t[0] == '.')
254 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000255
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700257 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700258 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700259 if (!r)
260 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700261 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 if (r != q) {
263 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700264 r = new_string(q, strlen(q), tok);
265 if (!r)
266 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700268 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 }
270 }
271 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700272 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273}
274
275/* Check whether the line contains a coding spec. If it does,
276 invoke the set_readline function for the new encoding.
277 This function receives the tok_state and the new encoding.
278 Return 1 on success, 0 on failure. */
279
280static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000281check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700284 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000286
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200287 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200289 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200291 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700292 if (!get_coding_spec(line, &cs, size, tok))
293 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200294 if (!cs) {
295 Py_ssize_t i;
296 for (i = 0; i < size; i++) {
297 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
298 break;
299 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
300 /* Stop checking coding spec after a line containing
301 * anything except a comment. */
302 tok->read_coding_spec = 1;
303 break;
304 }
305 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700306 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200307 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700308 tok->read_coding_spec = 1;
309 if (tok->encoding == NULL) {
310 assert(tok->decoding_state == STATE_RAW);
311 if (strcmp(cs, "utf-8") == 0) {
312 tok->encoding = cs;
313 } else {
314 r = set_readline(tok, cs);
315 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700317 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300320 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700321 "encoding problem: %s", cs);
322 PyMem_FREE(cs);
323 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700325 } else { /* then, compare cs with BOM */
326 r = (strcmp(tok->encoding, cs) == 0);
327 if (!r)
328 PyErr_Format(PyExc_SyntaxError,
329 "encoding problem: %s with BOM", cs);
330 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333}
334
335/* See whether the file starts with a BOM. If it does,
336 invoke the set_readline function with the new encoding.
337 Return 1 on success, 0 on failure. */
338
339static int
340check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 void unget_char(int, struct tok_state *),
342 int set_readline(struct tok_state *, const char *),
343 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000345 int ch1, ch2, ch3;
346 ch1 = get_char(tok);
347 tok->decoding_state = STATE_RAW;
348 if (ch1 == EOF) {
349 return 1;
350 } else if (ch1 == 0xEF) {
351 ch2 = get_char(tok);
352 if (ch2 != 0xBB) {
353 unget_char(ch2, tok);
354 unget_char(ch1, tok);
355 return 1;
356 }
357 ch3 = get_char(tok);
358 if (ch3 != 0xBF) {
359 unget_char(ch3, tok);
360 unget_char(ch2, tok);
361 unget_char(ch1, tok);
362 return 1;
363 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000364#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 /* Disable support for UTF-16 BOMs until a decision
366 is made whether this needs to be supported. */
367 } else if (ch1 == 0xFE) {
368 ch2 = get_char(tok);
369 if (ch2 != 0xFF) {
370 unget_char(ch2, tok);
371 unget_char(ch1, tok);
372 return 1;
373 }
374 if (!set_readline(tok, "utf-16-be"))
375 return 0;
376 tok->decoding_state = STATE_NORMAL;
377 } else if (ch1 == 0xFF) {
378 ch2 = get_char(tok);
379 if (ch2 != 0xFE) {
380 unget_char(ch2, tok);
381 unget_char(ch1, tok);
382 return 1;
383 }
384 if (!set_readline(tok, "utf-16-le"))
385 return 0;
386 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000387#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000388 } else {
389 unget_char(ch1, tok);
390 return 1;
391 }
392 if (tok->encoding != NULL)
393 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700394 tok->encoding = new_string("utf-8", 5, tok);
395 if (!tok->encoding)
396 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* No need to set_readline: input is already utf-8 */
398 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399}
400
401/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000402 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000403
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000404 On entry, tok->decoding_buffer will be one of:
405 1) NULL: need to call tok->decoding_readline to get a new line
406 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000408 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 (in the s buffer) to copy entire contents of the line read
410 by tok->decoding_readline. tok->decoding_buffer has the overflow.
411 In this case, fp_readl is called in a loop (with an expanded buffer)
412 until the buffer ends with a '\n' (or until the end of the file is
413 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000414*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000415
416static char *
417fp_readl(char *s, int size, struct tok_state *tok)
418{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 PyObject* bufobj;
420 const char *buf;
421 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 /* Ask for one less byte so we can terminate it */
424 assert(size > 0);
425 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 if (tok->decoding_buffer) {
428 bufobj = tok->decoding_buffer;
429 Py_INCREF(bufobj);
430 }
431 else
432 {
433 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
434 if (bufobj == NULL)
435 goto error;
436 }
437 if (PyUnicode_CheckExact(bufobj))
438 {
439 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
440 if (buf == NULL) {
441 goto error;
442 }
443 }
444 else
445 {
446 buf = PyByteArray_AsString(bufobj);
447 if (buf == NULL) {
448 goto error;
449 }
450 buflen = PyByteArray_GET_SIZE(bufobj);
451 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000452
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 Py_XDECREF(tok->decoding_buffer);
454 if (buflen > size) {
455 /* Too many chars, the rest goes into tok->decoding_buffer */
456 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
457 buflen-size);
458 if (tok->decoding_buffer == NULL)
459 goto error;
460 buflen = size;
461 }
462 else
463 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000464
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 memcpy(s, buf, buflen);
466 s[buflen] = '\0';
467 if (buflen == 0) /* EOF */
468 s = NULL;
469 Py_DECREF(bufobj);
470 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000471
472error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 Py_XDECREF(bufobj);
474 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475}
476
477/* Set the readline function for TOK to a StreamReader's
478 readline function. The StreamReader is named ENC.
479
480 This function is called from check_bom and check_coding_spec.
481
482 ENC is usually identical to the future value of tok->encoding,
483 except for the (currently unsupported) case of UTF-16.
484
485 Return 1 on success, 0 on failure. */
486
487static int
488fp_setreadl(struct tok_state *tok, const char* enc)
489{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000490 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200491 _Py_IDENTIFIER(open);
492 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000493 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200494 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496 io = PyImport_ImportModuleNoBlock("io");
497 if (io == NULL)
498 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000499
Victor Stinner22a351a2010-10-14 12:04:34 +0000500 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200501 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100502 * position of tok->fp. If tok->fp was opened in text mode on Windows,
503 * its file position counts CRLF as one char and can't be directly mapped
504 * to the file offset for fd. Instead we step back one byte and read to
505 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200506 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100507 if (pos == -1 ||
508 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000509 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
510 goto cleanup;
511 }
512
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200513 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000514 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000515 if (stream == NULL)
516 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200519 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000520 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100521 if (pos > 0) {
522 if (PyObject_CallObject(readline, NULL) == NULL) {
523 readline = NULL;
524 goto cleanup;
525 }
526 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000527
528 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000529 Py_XDECREF(stream);
530 Py_XDECREF(io);
531 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532}
533
534/* Fetch the next byte from TOK. */
535
536static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000538}
539
540/* Unfetch the last byte back into TOK. */
541
542static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000543 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544}
545
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined in forward
   order, so a sequence cut short by the terminator is rejected at the
   terminator itself and nothing beyond it is ever read.

   Only the byte pattern is checked; overlong forms and code points
   outside the Unicode range are not rejected here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
573
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574/* Read a line of input from TOK. Determine encoding
575 if necessary. */
576
577static char *
578decoding_fgets(char *s, int size, struct tok_state *tok)
579{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000580 char *line = NULL;
581 int badchar = 0;
582 for (;;) {
583 if (tok->decoding_state == STATE_NORMAL) {
584 /* We already have a codec associated with
585 this input. */
586 line = fp_readl(s, size, tok);
587 break;
588 } else if (tok->decoding_state == STATE_RAW) {
589 /* We want a 'raw' read. */
590 line = Py_UniversalNewlineFgets(s, size,
591 tok->fp, NULL);
592 break;
593 } else {
594 /* We have not yet determined the encoding.
595 If an encoding is found, use the file-pointer
596 reader functions from now on. */
597 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
598 return error_ret(tok);
599 assert(tok->decoding_state != STATE_INIT);
600 }
601 }
602 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
603 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
604 return error_ret(tok);
605 }
606 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 /* The default encoding is UTF-8, so make sure we don't have any
609 non-UTF-8 sequences in it. */
610 if (line && !tok->encoding) {
611 unsigned char *c;
612 int length;
613 for (c = (unsigned char *)line; *c; c += length)
614 if (!(length = valid_utf8(c))) {
615 badchar = *c;
616 break;
617 }
618 }
619 if (badchar) {
620 /* Need to add 1 to the line number, since this line
621 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200622 PyErr_Format(PyExc_SyntaxError,
623 "Non-UTF-8 code starting with '\\x%.2x' "
624 "in file %U on line %i, "
625 "but no encoding declared; "
626 "see http://python.org/dev/peps/pep-0263/ for details",
627 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 return error_ret(tok);
629 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
634static int
635decoding_feof(struct tok_state *tok)
636{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 if (tok->decoding_state != STATE_NORMAL) {
638 return feof(tok->fp);
639 } else {
640 PyObject* buf = tok->decoding_buffer;
641 if (buf == NULL) {
642 buf = PyObject_CallObject(tok->decoding_readline, NULL);
643 if (buf == NULL) {
644 error_ret(tok);
645 return 1;
646 } else {
647 tok->decoding_buffer = buf;
648 }
649 }
650 return PyObject_Length(buf) == 0;
651 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652}
653
654/* Fetch a byte from TOK, using the string buffer. */
655
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000656static int
657buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659}
660
661/* Unfetch a byte from TOK, using the string buffer. */
662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000663static void
664buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 tok->str--;
666 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667}
668
669/* Set the readline function for TOK to ENC. For the string-based
670 tokenizer, this means to just record the encoding. */
671
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000672static int
673buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 tok->enc = enc;
675 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676}
677
678/* Return a UTF-8 encoding Python string object from the
679 C byte string STR, which is encoded with ENC. */
680
681static PyObject *
682translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 PyObject *utf8;
684 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
685 if (buf == NULL)
686 return NULL;
687 utf8 = PyUnicode_AsUTF8String(buf);
688 Py_DECREF(buf);
689 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000690}
691
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000692
693static char *
694translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200695 int skip_next_lf = 0;
696 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000697 char *buf, *current;
698 char c = '\0';
699 buf = PyMem_MALLOC(needed_length);
700 if (buf == NULL) {
701 tok->done = E_NOMEM;
702 return NULL;
703 }
704 for (current = buf; *s; s++, current++) {
705 c = *s;
706 if (skip_next_lf) {
707 skip_next_lf = 0;
708 if (c == '\n') {
709 c = *++s;
710 if (!c)
711 break;
712 }
713 }
714 if (c == '\r') {
715 skip_next_lf = 1;
716 c = '\n';
717 }
718 *current = c;
719 }
720 /* If this is exec input, add a newline to the end of the string if
721 there isn't one already. */
722 if (exec_input && c != '\n') {
723 *current = '\n';
724 current++;
725 }
726 *current = '\0';
727 final_length = current - buf + 1;
728 if (final_length < needed_length && final_length)
729 /* should never fail */
730 buf = PyMem_REALLOC(buf, final_length);
731 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000732}
733
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734/* Decode a byte string STR for use as the buffer of TOK.
735 Look for encoding declarations inside STR, and record them
736 inside TOK. */
737
738static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000739decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000740{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741 PyObject* utf8 = NULL;
742 const char *str;
743 const char *s;
744 const char *newl[2] = {NULL, NULL};
745 int lineno = 0;
746 tok->input = str = translate_newlines(input, single, tok);
747 if (str == NULL)
748 return NULL;
749 tok->enc = NULL;
750 tok->str = str;
751 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
752 return error_ret(tok);
753 str = tok->str; /* string after BOM if any */
754 assert(str);
755 if (tok->enc != NULL) {
756 utf8 = translate_into_utf8(str, tok->enc);
757 if (utf8 == NULL)
758 return error_ret(tok);
759 str = PyBytes_AsString(utf8);
760 }
761 for (s = str;; s++) {
762 if (*s == '\0') break;
763 else if (*s == '\n') {
764 assert(lineno < 2);
765 newl[lineno] = s;
766 lineno++;
767 if (lineno == 2) break;
768 }
769 }
770 tok->enc = NULL;
771 /* need to check line 1 and 2 separately since check_coding_spec
772 assumes a single line as input */
773 if (newl[0]) {
774 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
775 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200776 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
778 tok, buf_setreadl))
779 return error_ret(tok);
780 }
781 }
782 if (tok->enc != NULL) {
783 assert(utf8 == NULL);
784 utf8 = translate_into_utf8(str, tok->enc);
785 if (utf8 == NULL)
786 return error_ret(tok);
787 str = PyBytes_AS_STRING(utf8);
788 }
789 assert(tok->decoding_buffer == NULL);
790 tok->decoding_buffer = utf8; /* CAUTION */
791 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000792}
793
794#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000795
796/* Set up tokenizer for string */
797
798struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000799PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 struct tok_state *tok = tok_new();
802 if (tok == NULL)
803 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300804 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 if (str == NULL) {
806 PyTokenizer_Free(tok);
807 return NULL;
808 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000809
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 /* XXX: constify members. */
811 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
812 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813}
814
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000815struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000816PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000817{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 struct tok_state *tok = tok_new();
819 if (tok == NULL)
820 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000821#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000823#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 if (str == NULL) {
825 PyTokenizer_Free(tok);
826 return NULL;
827 }
828 tok->decoding_state = STATE_RAW;
829 tok->read_coding_spec = 1;
830 tok->enc = NULL;
831 tok->str = str;
832 tok->encoding = (char *)PyMem_MALLOC(6);
833 if (!tok->encoding) {
834 PyTokenizer_Free(tok);
835 return NULL;
836 }
837 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000838
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 /* XXX: constify members. */
840 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
841 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000842}
843
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000844/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845
846struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300847PyTokenizer_FromFile(FILE *fp, const char* enc,
848 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 struct tok_state *tok = tok_new();
851 if (tok == NULL)
852 return NULL;
853 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
854 PyTokenizer_Free(tok);
855 return NULL;
856 }
857 tok->cur = tok->inp = tok->buf;
858 tok->end = tok->buf + BUFSIZ;
859 tok->fp = fp;
860 tok->prompt = ps1;
861 tok->nextprompt = ps2;
862 if (enc != NULL) {
863 /* Must copy encoding declaration since it
864 gets copied into the parse tree. */
865 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
866 if (!tok->encoding) {
867 PyTokenizer_Free(tok);
868 return NULL;
869 }
870 strcpy(tok->encoding, enc);
871 tok->decoding_state = STATE_NORMAL;
872 }
873 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000874}
875
876
877/* Free a tok_state structure */
878
879void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000880PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 if (tok->encoding != NULL)
883 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000884#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 Py_XDECREF(tok->decoding_readline);
886 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200887 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000888#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (tok->fp != NULL && tok->buf != NULL)
890 PyMem_FREE(tok->buf);
891 if (tok->input)
892 PyMem_FREE((char *)tok->input);
893 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000894}
895
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896/* Get next char, updating state; error code goes into tok->done */
897
898static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200899tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000900{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 for (;;) {
902 if (tok->cur != tok->inp) {
903 return Py_CHARMASK(*tok->cur++); /* Fast path */
904 }
905 if (tok->done != E_OK)
906 return EOF;
907 if (tok->fp == NULL) {
908 char *end = strchr(tok->inp, '\n');
909 if (end != NULL)
910 end++;
911 else {
912 end = strchr(tok->inp, '\0');
913 if (end == tok->inp) {
914 tok->done = E_EOF;
915 return EOF;
916 }
917 }
918 if (tok->start == NULL)
919 tok->buf = tok->cur;
920 tok->line_start = tok->cur;
921 tok->lineno++;
922 tok->inp = end;
923 return Py_CHARMASK(*tok->cur++);
924 }
925 if (tok->prompt != NULL) {
926 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000927#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000928 if (newtok != NULL) {
929 char *translated = translate_newlines(newtok, 0, tok);
930 PyMem_FREE(newtok);
931 if (translated == NULL)
932 return EOF;
933 newtok = translated;
934 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 if (tok->encoding && newtok && *newtok) {
936 /* Recode to UTF-8 */
937 Py_ssize_t buflen;
938 const char* buf;
939 PyObject *u = translate_into_utf8(newtok, tok->encoding);
940 PyMem_FREE(newtok);
941 if (!u) {
942 tok->done = E_DECODE;
943 return EOF;
944 }
945 buflen = PyBytes_GET_SIZE(u);
946 buf = PyBytes_AS_STRING(u);
947 if (!buf) {
948 Py_DECREF(u);
949 tok->done = E_DECODE;
950 return EOF;
951 }
952 newtok = PyMem_MALLOC(buflen+1);
953 strcpy(newtok, buf);
954 Py_DECREF(u);
955 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000956#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000957 if (tok->nextprompt != NULL)
958 tok->prompt = tok->nextprompt;
959 if (newtok == NULL)
960 tok->done = E_INTR;
961 else if (*newtok == '\0') {
962 PyMem_FREE(newtok);
963 tok->done = E_EOF;
964 }
965 else if (tok->start != NULL) {
966 size_t start = tok->start - tok->buf;
967 size_t oldlen = tok->cur - tok->buf;
968 size_t newlen = oldlen + strlen(newtok);
969 char *buf = tok->buf;
970 buf = (char *)PyMem_REALLOC(buf, newlen+1);
971 tok->lineno++;
972 if (buf == NULL) {
973 PyMem_FREE(tok->buf);
974 tok->buf = NULL;
975 PyMem_FREE(newtok);
976 tok->done = E_NOMEM;
977 return EOF;
978 }
979 tok->buf = buf;
980 tok->cur = tok->buf + oldlen;
981 tok->line_start = tok->cur;
982 strcpy(tok->buf + oldlen, newtok);
983 PyMem_FREE(newtok);
984 tok->inp = tok->buf + newlen;
985 tok->end = tok->inp + 1;
986 tok->start = tok->buf + start;
987 }
988 else {
989 tok->lineno++;
990 if (tok->buf != NULL)
991 PyMem_FREE(tok->buf);
992 tok->buf = newtok;
993 tok->line_start = tok->buf;
994 tok->cur = tok->buf;
995 tok->line_start = tok->buf;
996 tok->inp = strchr(tok->buf, '\0');
997 tok->end = tok->inp + 1;
998 }
999 }
1000 else {
1001 int done = 0;
1002 Py_ssize_t cur = 0;
1003 char *pt;
1004 if (tok->start == NULL) {
1005 if (tok->buf == NULL) {
1006 tok->buf = (char *)
1007 PyMem_MALLOC(BUFSIZ);
1008 if (tok->buf == NULL) {
1009 tok->done = E_NOMEM;
1010 return EOF;
1011 }
1012 tok->end = tok->buf + BUFSIZ;
1013 }
1014 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1015 tok) == NULL) {
1016 tok->done = E_EOF;
1017 done = 1;
1018 }
1019 else {
1020 tok->done = E_OK;
1021 tok->inp = strchr(tok->buf, '\0');
1022 done = tok->inp[-1] == '\n';
1023 }
1024 }
1025 else {
1026 cur = tok->cur - tok->buf;
1027 if (decoding_feof(tok)) {
1028 tok->done = E_EOF;
1029 done = 1;
1030 }
1031 else
1032 tok->done = E_OK;
1033 }
1034 tok->lineno++;
1035 /* Read until '\n' or EOF */
1036 while (!done) {
1037 Py_ssize_t curstart = tok->start == NULL ? -1 :
1038 tok->start - tok->buf;
1039 Py_ssize_t curvalid = tok->inp - tok->buf;
1040 Py_ssize_t newsize = curvalid + BUFSIZ;
1041 char *newbuf = tok->buf;
1042 newbuf = (char *)PyMem_REALLOC(newbuf,
1043 newsize);
1044 if (newbuf == NULL) {
1045 tok->done = E_NOMEM;
1046 tok->cur = tok->inp;
1047 return EOF;
1048 }
1049 tok->buf = newbuf;
1050 tok->inp = tok->buf + curvalid;
1051 tok->end = tok->buf + newsize;
1052 tok->start = curstart < 0 ? NULL :
1053 tok->buf + curstart;
1054 if (decoding_fgets(tok->inp,
1055 (int)(tok->end - tok->inp),
1056 tok) == NULL) {
1057 /* Break out early on decoding
1058 errors, as tok->buf will be NULL
1059 */
1060 if (tok->decoding_erred)
1061 return EOF;
1062 /* Last line does not end in \n,
1063 fake one */
1064 strcpy(tok->inp, "\n");
1065 }
1066 tok->inp = strchr(tok->inp, '\0');
1067 done = tok->inp[-1] == '\n';
1068 }
1069 if (tok->buf != NULL) {
1070 tok->cur = tok->buf + cur;
1071 tok->line_start = tok->cur;
1072 /* replace "\r\n" with "\n" */
1073 /* For Mac leave the \r, giving a syntax error */
1074 pt = tok->inp - 2;
1075 if (pt >= tok->buf && *pt == '\r') {
1076 *pt++ = '\n';
1077 *pt = '\0';
1078 tok->inp = pt;
1079 }
1080 }
1081 }
1082 if (tok->done != E_OK) {
1083 if (tok->prompt != NULL)
1084 PySys_WriteStderr("\n");
1085 tok->cur = tok->inp;
1086 return EOF;
1087 }
1088 }
1089 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001090}
1091
1092
1093/* Back-up one character */
1094
1095static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001096tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 if (c != EOF) {
1099 if (--tok->cur < tok->buf)
1100 Py_FatalError("tok_backup: beginning of buffer");
1101 if (*tok->cur != c)
1102 *tok->cur = c;
1103 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104}
1105
1106
1107/* Return the token corresponding to a single character */
1108
1109int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001110PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001111{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 switch (c) {
1113 case '(': return LPAR;
1114 case ')': return RPAR;
1115 case '[': return LSQB;
1116 case ']': return RSQB;
1117 case ':': return COLON;
1118 case ',': return COMMA;
1119 case ';': return SEMI;
1120 case '+': return PLUS;
1121 case '-': return MINUS;
1122 case '*': return STAR;
1123 case '/': return SLASH;
1124 case '|': return VBAR;
1125 case '&': return AMPER;
1126 case '<': return LESS;
1127 case '>': return GREATER;
1128 case '=': return EQUAL;
1129 case '.': return DOT;
1130 case '%': return PERCENT;
1131 case '{': return LBRACE;
1132 case '}': return RBRACE;
1133 case '^': return CIRCUMFLEX;
1134 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001135 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 default: return OP;
1137 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001138}
1139
1140
Guido van Rossumfbab9051991-10-20 20:25:03 +00001141int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001142PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001143{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 switch (c1) {
1145 case '=':
1146 switch (c2) {
1147 case '=': return EQEQUAL;
1148 }
1149 break;
1150 case '!':
1151 switch (c2) {
1152 case '=': return NOTEQUAL;
1153 }
1154 break;
1155 case '<':
1156 switch (c2) {
1157 case '>': return NOTEQUAL;
1158 case '=': return LESSEQUAL;
1159 case '<': return LEFTSHIFT;
1160 }
1161 break;
1162 case '>':
1163 switch (c2) {
1164 case '=': return GREATEREQUAL;
1165 case '>': return RIGHTSHIFT;
1166 }
1167 break;
1168 case '+':
1169 switch (c2) {
1170 case '=': return PLUSEQUAL;
1171 }
1172 break;
1173 case '-':
1174 switch (c2) {
1175 case '=': return MINEQUAL;
1176 case '>': return RARROW;
1177 }
1178 break;
1179 case '*':
1180 switch (c2) {
1181 case '*': return DOUBLESTAR;
1182 case '=': return STAREQUAL;
1183 }
1184 break;
1185 case '/':
1186 switch (c2) {
1187 case '/': return DOUBLESLASH;
1188 case '=': return SLASHEQUAL;
1189 }
1190 break;
1191 case '|':
1192 switch (c2) {
1193 case '=': return VBAREQUAL;
1194 }
1195 break;
1196 case '%':
1197 switch (c2) {
1198 case '=': return PERCENTEQUAL;
1199 }
1200 break;
1201 case '&':
1202 switch (c2) {
1203 case '=': return AMPEREQUAL;
1204 }
1205 break;
1206 case '^':
1207 switch (c2) {
1208 case '=': return CIRCUMFLEXEQUAL;
1209 }
1210 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001211 case '@':
1212 switch (c2) {
1213 case '=': return ATEQUAL;
1214 }
1215 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001216 }
1217 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001218}
1219
Thomas Wouters434d0822000-08-24 20:11:32 +00001220int
1221PyToken_ThreeChars(int c1, int c2, int c3)
1222{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 switch (c1) {
1224 case '<':
1225 switch (c2) {
1226 case '<':
1227 switch (c3) {
1228 case '=':
1229 return LEFTSHIFTEQUAL;
1230 }
1231 break;
1232 }
1233 break;
1234 case '>':
1235 switch (c2) {
1236 case '>':
1237 switch (c3) {
1238 case '=':
1239 return RIGHTSHIFTEQUAL;
1240 }
1241 break;
1242 }
1243 break;
1244 case '*':
1245 switch (c2) {
1246 case '*':
1247 switch (c3) {
1248 case '=':
1249 return DOUBLESTAREQUAL;
1250 }
1251 break;
1252 }
1253 break;
1254 case '/':
1255 switch (c2) {
1256 case '/':
1257 switch (c3) {
1258 case '=':
1259 return DOUBLESLASHEQUAL;
1260 }
1261 break;
1262 }
1263 break;
1264 case '.':
1265 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001266 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 switch (c3) {
1268 case '.':
1269 return ELLIPSIS;
1270 }
1271 break;
1272 }
1273 break;
1274 }
1275 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001276}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001277
Guido van Rossum926f13a1998-04-09 21:38:06 +00001278static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001279indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001280{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001281 if (tok->alterror) {
1282 tok->done = E_TABSPACE;
1283 tok->cur = tok->inp;
1284 return 1;
1285 }
1286 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001287#ifdef PGEN
1288 PySys_WriteStderr("inconsistent use of tabs and spaces "
1289 "in indentation\n");
1290#else
1291 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001293#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 tok->altwarning = 0;
1295 }
1296 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001297}
1298
Martin v. Löwis47383402007-08-15 07:32:56 +00001299#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001300#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001301#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302/* Verify that the identifier follows PEP 3131.
1303 All identifier strings are guaranteed to be "ready" unicode objects.
1304 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001305static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001306verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001307{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 PyObject *s;
1309 int result;
1310 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1313 PyErr_Clear();
1314 tok->done = E_IDENTIFIER;
1315 } else {
1316 tok->done = E_ERROR;
1317 }
1318 return 0;
1319 }
1320 result = PyUnicode_IsIdentifier(s);
1321 Py_DECREF(s);
1322 if (result == 0)
1323 tok->done = E_IDENTIFIER;
1324 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001325}
1326#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001327
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328/* Get next token, after space stripping etc. */
1329
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001330static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001331tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001332{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001333 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001335
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001337 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 tok->start = NULL;
1339 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001340
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 /* Get indentation level */
1342 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001343 int col = 0;
1344 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 tok->atbol = 0;
1346 for (;;) {
1347 c = tok_nextc(tok);
1348 if (c == ' ')
1349 col++, altcol++;
1350 else if (c == '\t') {
1351 col = (col/tok->tabsize + 1) * tok->tabsize;
1352 altcol = (altcol/tok->alttabsize + 1)
1353 * tok->alttabsize;
1354 }
1355 else if (c == '\014') /* Control-L (formfeed) */
1356 col = altcol = 0; /* For Emacs users */
1357 else
1358 break;
1359 }
1360 tok_backup(tok, c);
1361 if (c == '#' || c == '\n') {
1362 /* Lines with only whitespace and/or comments
1363 shouldn't affect the indentation and are
1364 not passed to the parser as NEWLINE tokens,
1365 except *totally* empty lines in interactive
1366 mode, which signal the end of a command group. */
1367 if (col == 0 && c == '\n' && tok->prompt != NULL)
1368 blankline = 0; /* Let it through */
1369 else
1370 blankline = 1; /* Ignore completely */
1371 /* We can't jump back right here since we still
1372 may need to skip to the end of a comment */
1373 }
1374 if (!blankline && tok->level == 0) {
1375 if (col == tok->indstack[tok->indent]) {
1376 /* No change */
1377 if (altcol != tok->altindstack[tok->indent]) {
1378 if (indenterror(tok))
1379 return ERRORTOKEN;
1380 }
1381 }
1382 else if (col > tok->indstack[tok->indent]) {
1383 /* Indent -- always one */
1384 if (tok->indent+1 >= MAXINDENT) {
1385 tok->done = E_TOODEEP;
1386 tok->cur = tok->inp;
1387 return ERRORTOKEN;
1388 }
1389 if (altcol <= tok->altindstack[tok->indent]) {
1390 if (indenterror(tok))
1391 return ERRORTOKEN;
1392 }
1393 tok->pendin++;
1394 tok->indstack[++tok->indent] = col;
1395 tok->altindstack[tok->indent] = altcol;
1396 }
1397 else /* col < tok->indstack[tok->indent] */ {
1398 /* Dedent -- any number, must be consistent */
1399 while (tok->indent > 0 &&
1400 col < tok->indstack[tok->indent]) {
1401 tok->pendin--;
1402 tok->indent--;
1403 }
1404 if (col != tok->indstack[tok->indent]) {
1405 tok->done = E_DEDENT;
1406 tok->cur = tok->inp;
1407 return ERRORTOKEN;
1408 }
1409 if (altcol != tok->altindstack[tok->indent]) {
1410 if (indenterror(tok))
1411 return ERRORTOKEN;
1412 }
1413 }
1414 }
1415 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001416
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001418
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419 /* Return pending indents/dedents */
1420 if (tok->pendin != 0) {
1421 if (tok->pendin < 0) {
1422 tok->pendin++;
1423 return DEDENT;
1424 }
1425 else {
1426 tok->pendin--;
1427 return INDENT;
1428 }
1429 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001430
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001431 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 tok->start = NULL;
1433 /* Skip spaces */
1434 do {
1435 c = tok_nextc(tok);
1436 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 /* Set start of current token */
1439 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 /* Skip comment */
1442 if (c == '#')
1443 while (c != EOF && c != '\n')
1444 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001445
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001446 /* Check for EOF and errors now */
1447 if (c == EOF) {
1448 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1449 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001450
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 /* Identifier (most frequent token!) */
1452 nonascii = 0;
1453 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001454 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001455 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001456 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001457 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001458 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001459 /* Since this is a backwards compatibility support literal we don't
1460 want to support it in arbitrary order like byte literals. */
1461 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1462 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001463 /* ur"" and ru"" are not supported */
1464 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001465 saw_r = 1;
1466 else
1467 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001468 c = tok_nextc(tok);
1469 if (c == '"' || c == '\'')
1470 goto letter_quote;
1471 }
1472 while (is_potential_identifier_char(c)) {
1473 if (c >= 128)
1474 nonascii = 1;
1475 c = tok_nextc(tok);
1476 }
1477 tok_backup(tok, c);
1478 if (nonascii &&
1479 !verify_identifier(tok)) {
1480 tok->done = E_IDENTIFIER;
1481 return ERRORTOKEN;
1482 }
1483 *p_start = tok->start;
1484 *p_end = tok->cur;
1485 return NAME;
1486 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001488 /* Newline */
1489 if (c == '\n') {
1490 tok->atbol = 1;
1491 if (blankline || tok->level > 0)
1492 goto nextline;
1493 *p_start = tok->start;
1494 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1495 tok->cont_line = 0;
1496 return NEWLINE;
1497 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 /* Period or number starting with period? */
1500 if (c == '.') {
1501 c = tok_nextc(tok);
1502 if (isdigit(c)) {
1503 goto fraction;
1504 } else if (c == '.') {
1505 c = tok_nextc(tok);
1506 if (c == '.') {
1507 *p_start = tok->start;
1508 *p_end = tok->cur;
1509 return ELLIPSIS;
1510 } else {
1511 tok_backup(tok, c);
1512 }
1513 tok_backup(tok, '.');
1514 } else {
1515 tok_backup(tok, c);
1516 }
1517 *p_start = tok->start;
1518 *p_end = tok->cur;
1519 return DOT;
1520 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001521
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001522 /* Number */
1523 if (isdigit(c)) {
1524 if (c == '0') {
1525 /* Hex, octal or binary -- maybe. */
1526 c = tok_nextc(tok);
1527 if (c == '.')
1528 goto fraction;
1529 if (c == 'j' || c == 'J')
1530 goto imaginary;
1531 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001532
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001533 /* Hex */
1534 c = tok_nextc(tok);
1535 if (!isxdigit(c)) {
1536 tok->done = E_TOKEN;
1537 tok_backup(tok, c);
1538 return ERRORTOKEN;
1539 }
1540 do {
1541 c = tok_nextc(tok);
1542 } while (isxdigit(c));
1543 }
1544 else if (c == 'o' || c == 'O') {
1545 /* Octal */
1546 c = tok_nextc(tok);
1547 if (c < '0' || c >= '8') {
1548 tok->done = E_TOKEN;
1549 tok_backup(tok, c);
1550 return ERRORTOKEN;
1551 }
1552 do {
1553 c = tok_nextc(tok);
1554 } while ('0' <= c && c < '8');
1555 }
1556 else if (c == 'b' || c == 'B') {
1557 /* Binary */
1558 c = tok_nextc(tok);
1559 if (c != '0' && c != '1') {
1560 tok->done = E_TOKEN;
1561 tok_backup(tok, c);
1562 return ERRORTOKEN;
1563 }
1564 do {
1565 c = tok_nextc(tok);
1566 } while (c == '0' || c == '1');
1567 }
1568 else {
1569 int nonzero = 0;
1570 /* maybe old-style octal; c is first char of it */
1571 /* in any case, allow '0' as a literal */
1572 while (c == '0')
1573 c = tok_nextc(tok);
1574 while (isdigit(c)) {
1575 nonzero = 1;
1576 c = tok_nextc(tok);
1577 }
1578 if (c == '.')
1579 goto fraction;
1580 else if (c == 'e' || c == 'E')
1581 goto exponent;
1582 else if (c == 'j' || c == 'J')
1583 goto imaginary;
1584 else if (nonzero) {
1585 tok->done = E_TOKEN;
1586 tok_backup(tok, c);
1587 return ERRORTOKEN;
1588 }
1589 }
1590 }
1591 else {
1592 /* Decimal */
1593 do {
1594 c = tok_nextc(tok);
1595 } while (isdigit(c));
1596 {
1597 /* Accept floating point numbers. */
1598 if (c == '.') {
1599 fraction:
1600 /* Fraction */
1601 do {
1602 c = tok_nextc(tok);
1603 } while (isdigit(c));
1604 }
1605 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001606 int e;
1607 exponent:
1608 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 /* Exponent part */
1610 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001611 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001613 if (!isdigit(c)) {
1614 tok->done = E_TOKEN;
1615 tok_backup(tok, c);
1616 return ERRORTOKEN;
1617 }
1618 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001619 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001620 tok_backup(tok, e);
1621 *p_start = tok->start;
1622 *p_end = tok->cur;
1623 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 }
1625 do {
1626 c = tok_nextc(tok);
1627 } while (isdigit(c));
1628 }
1629 if (c == 'j' || c == 'J')
1630 /* Imaginary part */
1631 imaginary:
1632 c = tok_nextc(tok);
1633 }
1634 }
1635 tok_backup(tok, c);
1636 *p_start = tok->start;
1637 *p_end = tok->cur;
1638 return NUMBER;
1639 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001640
1641 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 /* String */
1643 if (c == '\'' || c == '"') {
1644 int quote = c;
1645 int quote_size = 1; /* 1 or 3 */
1646 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001647
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 /* Find the quote size and start of string */
1649 c = tok_nextc(tok);
1650 if (c == quote) {
1651 c = tok_nextc(tok);
1652 if (c == quote)
1653 quote_size = 3;
1654 else
1655 end_quote_size = 1; /* empty string found */
1656 }
1657 if (c != quote)
1658 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001659
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 /* Get rest of string */
1661 while (end_quote_size != quote_size) {
1662 c = tok_nextc(tok);
1663 if (c == EOF) {
1664 if (quote_size == 3)
1665 tok->done = E_EOFS;
1666 else
1667 tok->done = E_EOLS;
1668 tok->cur = tok->inp;
1669 return ERRORTOKEN;
1670 }
1671 if (quote_size == 1 && c == '\n') {
1672 tok->done = E_EOLS;
1673 tok->cur = tok->inp;
1674 return ERRORTOKEN;
1675 }
1676 if (c == quote)
1677 end_quote_size += 1;
1678 else {
1679 end_quote_size = 0;
1680 if (c == '\\')
1681 c = tok_nextc(tok); /* skip escaped char */
1682 }
1683 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001684
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 *p_start = tok->start;
1686 *p_end = tok->cur;
1687 return STRING;
1688 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001689
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690 /* Line continuation */
1691 if (c == '\\') {
1692 c = tok_nextc(tok);
1693 if (c != '\n') {
1694 tok->done = E_LINECONT;
1695 tok->cur = tok->inp;
1696 return ERRORTOKEN;
1697 }
1698 tok->cont_line = 1;
1699 goto again; /* Read next line */
1700 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001701
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 /* Check for two-character token */
1703 {
1704 int c2 = tok_nextc(tok);
1705 int token = PyToken_TwoChars(c, c2);
1706 if (token != OP) {
1707 int c3 = tok_nextc(tok);
1708 int token3 = PyToken_ThreeChars(c, c2, c3);
1709 if (token3 != OP) {
1710 token = token3;
1711 } else {
1712 tok_backup(tok, c3);
1713 }
1714 *p_start = tok->start;
1715 *p_end = tok->cur;
1716 return token;
1717 }
1718 tok_backup(tok, c2);
1719 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001720
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 /* Keep track of parentheses nesting level */
1722 switch (c) {
1723 case '(':
1724 case '[':
1725 case '{':
1726 tok->level++;
1727 break;
1728 case ')':
1729 case ']':
1730 case '}':
1731 tok->level--;
1732 break;
1733 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001734
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 /* Punctuation character */
1736 *p_start = tok->start;
1737 *p_end = tok->cur;
1738 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001739}
1740
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001741int
1742PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1743{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 int result = tok_get(tok, p_start, p_end);
1745 if (tok->decoding_erred) {
1746 result = ERRORTOKEN;
1747 tok->done = E_DECODE;
1748 }
1749 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001750}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001751
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001752/* Get the encoding of a Python file. Check for the coding cookie and check if
1753 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001754
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001755 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1756 encoding in the first or second line of the file (in which case the encoding
1757 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001758
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001759 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1760 by the caller. */
1761
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001762char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001763PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001764{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 struct tok_state *tok;
1766 FILE *fp;
1767 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001768
Victor Stinnerdaf45552013-08-28 00:53:59 +02001769#ifndef PGEN
1770 fd = _Py_dup(fd);
1771#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001773#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 if (fd < 0) {
1775 return NULL;
1776 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001777
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001778 fp = fdopen(fd, "r");
1779 if (fp == NULL) {
1780 return NULL;
1781 }
1782 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1783 if (tok == NULL) {
1784 fclose(fp);
1785 return NULL;
1786 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001787#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001788 if (filename != NULL) {
1789 Py_INCREF(filename);
1790 tok->filename = filename;
1791 }
1792 else {
1793 tok->filename = PyUnicode_FromString("<string>");
1794 if (tok->filename == NULL) {
1795 fclose(fp);
1796 PyTokenizer_Free(tok);
1797 return encoding;
1798 }
1799 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001800#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001801 while (tok->lineno < 2 && tok->done == E_OK) {
1802 PyTokenizer_Get(tok, &p_start, &p_end);
1803 }
1804 fclose(fp);
1805 if (tok->encoding) {
1806 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1807 if (encoding)
1808 strcpy(encoding, tok->encoding);
1809 }
1810 PyTokenizer_Free(tok);
1811 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001812}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001813
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001814char *
1815PyTokenizer_FindEncoding(int fd)
1816{
1817 return PyTokenizer_FindEncodingFilename(fd, NULL);
1818}
1819
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text [start, end) in parentheses for the token kinds that
   carry text. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif