blob: 1a8cf40c2ce44822edcbbec493f38e322f6258fa [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Byte-level classification of characters that may begin or continue an
   identifier.  ASCII letters and '_' may start one; ASCII digits are
   additionally accepted as continuation characters.  Every value >= 128 is
   accepted by both macros (presumably because non-ASCII identifiers are
   validated elsewhere -- confirm against the caller).

   The argument is fully parenthesized so that expression arguments such as
   `x | 0x20` classify correctly.  It is still evaluated several times, so do
   not pass an expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP",
    /* 53 */ "<ERRORTOKEN>",
    /* 54 */ "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: the input is not decoded, only duplicated.  EXEC_INPUT is
   ignored.  Returns NULL (with tok->done set by new_string) on allocation
   failure. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200190 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
191 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000193}
194
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000195
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is replaced by '-' before comparison.  Returns the string literal
   "utf-8" or "iso-8859-1" when S names (or starts with a '-'-suffixed form
   of) one of those encodings, otherwise returns S itself unchanged.
   Callers can therefore detect normalization with a pointer comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a high-bit
           byte where plain char is signed) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
224
225/* Return the coding spec in S, or NULL if none is found. */
226
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700227static int
228get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000229{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700231 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 /* Coding spec must be in a comment, and that comment must be
233 * the only statement on the source code line. */
234 for (i = 0; i < size - 6; i++) {
235 if (s[i] == '#')
236 break;
237 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700238 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 }
240 for (; i < size - 6; i++) { /* XXX inefficient search */
241 const char* t = s + i;
242 if (strncmp(t, "coding", 6) == 0) {
243 const char* begin = NULL;
244 t += 6;
245 if (t[0] != ':' && t[0] != '=')
246 continue;
247 do {
248 t++;
249 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 begin = t;
252 while (Py_ISALNUM(t[0]) ||
253 t[0] == '-' || t[0] == '_' || t[0] == '.')
254 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000255
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700257 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700258 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700259 if (!r)
260 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700261 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 if (r != q) {
263 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700264 r = new_string(q, strlen(q), tok);
265 if (!r)
266 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700268 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 }
270 }
271 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700272 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273}
274
275/* Check whether the line contains a coding spec. If it does,
276 invoke the set_readline function for the new encoding.
277 This function receives the tok_state and the new encoding.
278 Return 1 on success, 0 on failure. */
279
280static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000281check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700284 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000286
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200287 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200289 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200291 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700292 if (!get_coding_spec(line, &cs, size, tok))
293 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200294 if (!cs) {
295 Py_ssize_t i;
296 for (i = 0; i < size; i++) {
297 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
298 break;
299 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
300 /* Stop checking coding spec after a line containing
301 * anything except a comment. */
302 tok->read_coding_spec = 1;
303 break;
304 }
305 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700306 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200307 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700308 tok->read_coding_spec = 1;
309 if (tok->encoding == NULL) {
310 assert(tok->decoding_state == STATE_RAW);
311 if (strcmp(cs, "utf-8") == 0) {
312 tok->encoding = cs;
313 } else {
314 r = set_readline(tok, cs);
315 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700317 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300320 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700321 "encoding problem: %s", cs);
322 PyMem_FREE(cs);
323 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700325 } else { /* then, compare cs with BOM */
326 r = (strcmp(tok->encoding, cs) == 0);
327 if (!r)
328 PyErr_Format(PyExc_SyntaxError,
329 "encoding problem: %s with BOM", cs);
330 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333}
334
335/* See whether the file starts with a BOM. If it does,
336 invoke the set_readline function with the new encoding.
337 Return 1 on success, 0 on failure. */
338
339static int
340check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 void unget_char(int, struct tok_state *),
342 int set_readline(struct tok_state *, const char *),
343 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000345 int ch1, ch2, ch3;
346 ch1 = get_char(tok);
347 tok->decoding_state = STATE_RAW;
348 if (ch1 == EOF) {
349 return 1;
350 } else if (ch1 == 0xEF) {
351 ch2 = get_char(tok);
352 if (ch2 != 0xBB) {
353 unget_char(ch2, tok);
354 unget_char(ch1, tok);
355 return 1;
356 }
357 ch3 = get_char(tok);
358 if (ch3 != 0xBF) {
359 unget_char(ch3, tok);
360 unget_char(ch2, tok);
361 unget_char(ch1, tok);
362 return 1;
363 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000364#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 /* Disable support for UTF-16 BOMs until a decision
366 is made whether this needs to be supported. */
367 } else if (ch1 == 0xFE) {
368 ch2 = get_char(tok);
369 if (ch2 != 0xFF) {
370 unget_char(ch2, tok);
371 unget_char(ch1, tok);
372 return 1;
373 }
374 if (!set_readline(tok, "utf-16-be"))
375 return 0;
376 tok->decoding_state = STATE_NORMAL;
377 } else if (ch1 == 0xFF) {
378 ch2 = get_char(tok);
379 if (ch2 != 0xFE) {
380 unget_char(ch2, tok);
381 unget_char(ch1, tok);
382 return 1;
383 }
384 if (!set_readline(tok, "utf-16-le"))
385 return 0;
386 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000387#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000388 } else {
389 unget_char(ch1, tok);
390 return 1;
391 }
392 if (tok->encoding != NULL)
393 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700394 tok->encoding = new_string("utf-8", 5, tok);
395 if (!tok->encoding)
396 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* No need to set_readline: input is already utf-8 */
398 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399}
400
401/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000402 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000403
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000404 On entry, tok->decoding_buffer will be one of:
405 1) NULL: need to call tok->decoding_readline to get a new line
406 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000408 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 (in the s buffer) to copy entire contents of the line read
410 by tok->decoding_readline. tok->decoding_buffer has the overflow.
411 In this case, fp_readl is called in a loop (with an expanded buffer)
412 until the buffer ends with a '\n' (or until the end of the file is
413 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000414*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000415
416static char *
417fp_readl(char *s, int size, struct tok_state *tok)
418{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 PyObject* bufobj;
420 const char *buf;
421 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 /* Ask for one less byte so we can terminate it */
424 assert(size > 0);
425 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 if (tok->decoding_buffer) {
428 bufobj = tok->decoding_buffer;
429 Py_INCREF(bufobj);
430 }
431 else
432 {
433 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
434 if (bufobj == NULL)
435 goto error;
436 }
437 if (PyUnicode_CheckExact(bufobj))
438 {
439 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
440 if (buf == NULL) {
441 goto error;
442 }
443 }
444 else
445 {
446 buf = PyByteArray_AsString(bufobj);
447 if (buf == NULL) {
448 goto error;
449 }
450 buflen = PyByteArray_GET_SIZE(bufobj);
451 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000452
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 Py_XDECREF(tok->decoding_buffer);
454 if (buflen > size) {
455 /* Too many chars, the rest goes into tok->decoding_buffer */
456 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
457 buflen-size);
458 if (tok->decoding_buffer == NULL)
459 goto error;
460 buflen = size;
461 }
462 else
463 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000464
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 memcpy(s, buf, buflen);
466 s[buflen] = '\0';
467 if (buflen == 0) /* EOF */
468 s = NULL;
469 Py_DECREF(bufobj);
470 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000471
472error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 Py_XDECREF(bufobj);
474 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475}
476
477/* Set the readline function for TOK to a StreamReader's
478 readline function. The StreamReader is named ENC.
479
480 This function is called from check_bom and check_coding_spec.
481
482 ENC is usually identical to the future value of tok->encoding,
483 except for the (currently unsupported) case of UTF-16.
484
485 Return 1 on success, 0 on failure. */
486
487static int
488fp_setreadl(struct tok_state *tok, const char* enc)
489{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000490 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200491 _Py_IDENTIFIER(open);
492 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000493 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200494 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496 io = PyImport_ImportModuleNoBlock("io");
497 if (io == NULL)
498 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000499
Victor Stinner22a351a2010-10-14 12:04:34 +0000500 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200501 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100502 * position of tok->fp. If tok->fp was opened in text mode on Windows,
503 * its file position counts CRLF as one char and can't be directly mapped
504 * to the file offset for fd. Instead we step back one byte and read to
505 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200506 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100507 if (pos == -1 ||
508 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000509 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
510 goto cleanup;
511 }
512
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200513 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000514 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000515 if (stream == NULL)
516 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200519 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000520 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100521 if (pos > 0) {
522 if (PyObject_CallObject(readline, NULL) == NULL) {
523 readline = NULL;
524 goto cleanup;
525 }
526 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000527
528 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000529 Py_XDECREF(stream);
530 Py_XDECREF(io);
531 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532}
533
534/* Fetch the next byte from TOK. */
535
536static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000538}
539
540/* Unfetch the last byte back into TOK. */
541
542static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000543 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544}
545
/* Check whether the bytes at s start a structurally valid UTF-8 sequence
   (lead byte followed by the right number of continuation bytes).
   s must point into a NUL-terminated buffer.
   Return the number of bytes forming the sequence if yes, 0 if not. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Check continuation bytes front to back.  A NUL terminator is < 0x80,
       so a sequence cut short by the end of the string is rejected at the
       terminator instead of reading bytes beyond it. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
573
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574/* Read a line of input from TOK. Determine encoding
575 if necessary. */
576
577static char *
578decoding_fgets(char *s, int size, struct tok_state *tok)
579{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000580 char *line = NULL;
581 int badchar = 0;
582 for (;;) {
583 if (tok->decoding_state == STATE_NORMAL) {
584 /* We already have a codec associated with
585 this input. */
586 line = fp_readl(s, size, tok);
587 break;
588 } else if (tok->decoding_state == STATE_RAW) {
589 /* We want a 'raw' read. */
590 line = Py_UniversalNewlineFgets(s, size,
591 tok->fp, NULL);
592 break;
593 } else {
594 /* We have not yet determined the encoding.
595 If an encoding is found, use the file-pointer
596 reader functions from now on. */
597 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
598 return error_ret(tok);
599 assert(tok->decoding_state != STATE_INIT);
600 }
601 }
602 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
603 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
604 return error_ret(tok);
605 }
606 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 /* The default encoding is UTF-8, so make sure we don't have any
609 non-UTF-8 sequences in it. */
610 if (line && !tok->encoding) {
611 unsigned char *c;
612 int length;
613 for (c = (unsigned char *)line; *c; c += length)
614 if (!(length = valid_utf8(c))) {
615 badchar = *c;
616 break;
617 }
618 }
619 if (badchar) {
620 /* Need to add 1 to the line number, since this line
621 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200622 PyErr_Format(PyExc_SyntaxError,
623 "Non-UTF-8 code starting with '\\x%.2x' "
624 "in file %U on line %i, "
625 "but no encoding declared; "
626 "see http://python.org/dev/peps/pep-0263/ for details",
627 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 return error_ret(tok);
629 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
634static int
635decoding_feof(struct tok_state *tok)
636{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 if (tok->decoding_state != STATE_NORMAL) {
638 return feof(tok->fp);
639 } else {
640 PyObject* buf = tok->decoding_buffer;
641 if (buf == NULL) {
642 buf = PyObject_CallObject(tok->decoding_readline, NULL);
643 if (buf == NULL) {
644 error_ret(tok);
645 return 1;
646 } else {
647 tok->decoding_buffer = buf;
648 }
649 }
650 return PyObject_Length(buf) == 0;
651 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652}
653
654/* Fetch a byte from TOK, using the string buffer. */
655
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000656static int
657buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659}
660
661/* Unfetch a byte from TOK, using the string buffer. */
662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000663static void
664buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 tok->str--;
666 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667}
668
669/* Set the readline function for TOK to ENC. For the string-based
670 tokenizer, this means to just record the encoding. */
671
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000672static int
673buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 tok->enc = enc;
675 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676}
677
678/* Return a UTF-8 encoding Python string object from the
679 C byte string STR, which is encoded with ENC. */
680
681static PyObject *
682translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 PyObject *utf8;
684 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
685 if (buf == NULL)
686 return NULL;
687 utf8 = PyUnicode_AsUTF8String(buf);
688 Py_DECREF(buf);
689 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000690}
691
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000692
693static char *
694translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200695 int skip_next_lf = 0;
696 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000697 char *buf, *current;
698 char c = '\0';
699 buf = PyMem_MALLOC(needed_length);
700 if (buf == NULL) {
701 tok->done = E_NOMEM;
702 return NULL;
703 }
704 for (current = buf; *s; s++, current++) {
705 c = *s;
706 if (skip_next_lf) {
707 skip_next_lf = 0;
708 if (c == '\n') {
709 c = *++s;
710 if (!c)
711 break;
712 }
713 }
714 if (c == '\r') {
715 skip_next_lf = 1;
716 c = '\n';
717 }
718 *current = c;
719 }
720 /* If this is exec input, add a newline to the end of the string if
721 there isn't one already. */
722 if (exec_input && c != '\n') {
723 *current = '\n';
724 current++;
725 }
726 *current = '\0';
727 final_length = current - buf + 1;
728 if (final_length < needed_length && final_length)
729 /* should never fail */
730 buf = PyMem_REALLOC(buf, final_length);
731 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000732}
733
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734/* Decode a byte string STR for use as the buffer of TOK.
735 Look for encoding declarations inside STR, and record them
736 inside TOK. */
737
738static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000739decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000740{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741 PyObject* utf8 = NULL;
742 const char *str;
743 const char *s;
744 const char *newl[2] = {NULL, NULL};
745 int lineno = 0;
746 tok->input = str = translate_newlines(input, single, tok);
747 if (str == NULL)
748 return NULL;
749 tok->enc = NULL;
750 tok->str = str;
751 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
752 return error_ret(tok);
753 str = tok->str; /* string after BOM if any */
754 assert(str);
755 if (tok->enc != NULL) {
756 utf8 = translate_into_utf8(str, tok->enc);
757 if (utf8 == NULL)
758 return error_ret(tok);
759 str = PyBytes_AsString(utf8);
760 }
761 for (s = str;; s++) {
762 if (*s == '\0') break;
763 else if (*s == '\n') {
764 assert(lineno < 2);
765 newl[lineno] = s;
766 lineno++;
767 if (lineno == 2) break;
768 }
769 }
770 tok->enc = NULL;
771 /* need to check line 1 and 2 separately since check_coding_spec
772 assumes a single line as input */
773 if (newl[0]) {
774 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
775 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200776 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
778 tok, buf_setreadl))
779 return error_ret(tok);
780 }
781 }
782 if (tok->enc != NULL) {
783 assert(utf8 == NULL);
784 utf8 = translate_into_utf8(str, tok->enc);
785 if (utf8 == NULL)
786 return error_ret(tok);
787 str = PyBytes_AS_STRING(utf8);
788 }
789 assert(tok->decoding_buffer == NULL);
790 tok->decoding_buffer = utf8; /* CAUTION */
791 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000792}
793
794#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000795
796/* Set up tokenizer for string */
797
798struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000799PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 struct tok_state *tok = tok_new();
802 if (tok == NULL)
803 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300804 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 if (str == NULL) {
806 PyTokenizer_Free(tok);
807 return NULL;
808 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000809
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 /* XXX: constify members. */
811 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
812 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813}
814
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000815struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000816PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000817{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 struct tok_state *tok = tok_new();
819 if (tok == NULL)
820 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000821#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000823#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 if (str == NULL) {
825 PyTokenizer_Free(tok);
826 return NULL;
827 }
828 tok->decoding_state = STATE_RAW;
829 tok->read_coding_spec = 1;
830 tok->enc = NULL;
831 tok->str = str;
832 tok->encoding = (char *)PyMem_MALLOC(6);
833 if (!tok->encoding) {
834 PyTokenizer_Free(tok);
835 return NULL;
836 }
837 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000838
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 /* XXX: constify members. */
840 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
841 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000842}
843
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000844/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845
846struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300847PyTokenizer_FromFile(FILE *fp, const char* enc,
848 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 struct tok_state *tok = tok_new();
851 if (tok == NULL)
852 return NULL;
853 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
854 PyTokenizer_Free(tok);
855 return NULL;
856 }
857 tok->cur = tok->inp = tok->buf;
858 tok->end = tok->buf + BUFSIZ;
859 tok->fp = fp;
860 tok->prompt = ps1;
861 tok->nextprompt = ps2;
862 if (enc != NULL) {
863 /* Must copy encoding declaration since it
864 gets copied into the parse tree. */
865 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
866 if (!tok->encoding) {
867 PyTokenizer_Free(tok);
868 return NULL;
869 }
870 strcpy(tok->encoding, enc);
871 tok->decoding_state = STATE_NORMAL;
872 }
873 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000874}
875
876
877/* Free a tok_state structure */
878
879void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000880PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 if (tok->encoding != NULL)
883 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000884#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 Py_XDECREF(tok->decoding_readline);
886 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200887 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000888#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (tok->fp != NULL && tok->buf != NULL)
890 PyMem_FREE(tok->buf);
891 if (tok->input)
892 PyMem_FREE((char *)tok->input);
893 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000894}
895
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896/* Get next char, updating state; error code goes into tok->done */
897
898static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200899tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000900{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 for (;;) {
902 if (tok->cur != tok->inp) {
903 return Py_CHARMASK(*tok->cur++); /* Fast path */
904 }
905 if (tok->done != E_OK)
906 return EOF;
907 if (tok->fp == NULL) {
908 char *end = strchr(tok->inp, '\n');
909 if (end != NULL)
910 end++;
911 else {
912 end = strchr(tok->inp, '\0');
913 if (end == tok->inp) {
914 tok->done = E_EOF;
915 return EOF;
916 }
917 }
918 if (tok->start == NULL)
919 tok->buf = tok->cur;
920 tok->line_start = tok->cur;
921 tok->lineno++;
922 tok->inp = end;
923 return Py_CHARMASK(*tok->cur++);
924 }
925 if (tok->prompt != NULL) {
926 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000927#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000928 if (newtok != NULL) {
929 char *translated = translate_newlines(newtok, 0, tok);
930 PyMem_FREE(newtok);
931 if (translated == NULL)
932 return EOF;
933 newtok = translated;
934 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 if (tok->encoding && newtok && *newtok) {
936 /* Recode to UTF-8 */
937 Py_ssize_t buflen;
938 const char* buf;
939 PyObject *u = translate_into_utf8(newtok, tok->encoding);
940 PyMem_FREE(newtok);
941 if (!u) {
942 tok->done = E_DECODE;
943 return EOF;
944 }
945 buflen = PyBytes_GET_SIZE(u);
946 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000947 newtok = PyMem_MALLOC(buflen+1);
948 strcpy(newtok, buf);
949 Py_DECREF(u);
950 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000951#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000952 if (tok->nextprompt != NULL)
953 tok->prompt = tok->nextprompt;
954 if (newtok == NULL)
955 tok->done = E_INTR;
956 else if (*newtok == '\0') {
957 PyMem_FREE(newtok);
958 tok->done = E_EOF;
959 }
960 else if (tok->start != NULL) {
961 size_t start = tok->start - tok->buf;
962 size_t oldlen = tok->cur - tok->buf;
963 size_t newlen = oldlen + strlen(newtok);
964 char *buf = tok->buf;
965 buf = (char *)PyMem_REALLOC(buf, newlen+1);
966 tok->lineno++;
967 if (buf == NULL) {
968 PyMem_FREE(tok->buf);
969 tok->buf = NULL;
970 PyMem_FREE(newtok);
971 tok->done = E_NOMEM;
972 return EOF;
973 }
974 tok->buf = buf;
975 tok->cur = tok->buf + oldlen;
976 tok->line_start = tok->cur;
977 strcpy(tok->buf + oldlen, newtok);
978 PyMem_FREE(newtok);
979 tok->inp = tok->buf + newlen;
980 tok->end = tok->inp + 1;
981 tok->start = tok->buf + start;
982 }
983 else {
984 tok->lineno++;
985 if (tok->buf != NULL)
986 PyMem_FREE(tok->buf);
987 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000988 tok->cur = tok->buf;
989 tok->line_start = tok->buf;
990 tok->inp = strchr(tok->buf, '\0');
991 tok->end = tok->inp + 1;
992 }
993 }
994 else {
995 int done = 0;
996 Py_ssize_t cur = 0;
997 char *pt;
998 if (tok->start == NULL) {
999 if (tok->buf == NULL) {
1000 tok->buf = (char *)
1001 PyMem_MALLOC(BUFSIZ);
1002 if (tok->buf == NULL) {
1003 tok->done = E_NOMEM;
1004 return EOF;
1005 }
1006 tok->end = tok->buf + BUFSIZ;
1007 }
1008 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1009 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001010 if (!tok->decoding_erred)
1011 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 done = 1;
1013 }
1014 else {
1015 tok->done = E_OK;
1016 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -07001017 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001018 }
1019 }
1020 else {
1021 cur = tok->cur - tok->buf;
1022 if (decoding_feof(tok)) {
1023 tok->done = E_EOF;
1024 done = 1;
1025 }
1026 else
1027 tok->done = E_OK;
1028 }
1029 tok->lineno++;
1030 /* Read until '\n' or EOF */
1031 while (!done) {
1032 Py_ssize_t curstart = tok->start == NULL ? -1 :
1033 tok->start - tok->buf;
1034 Py_ssize_t curvalid = tok->inp - tok->buf;
1035 Py_ssize_t newsize = curvalid + BUFSIZ;
1036 char *newbuf = tok->buf;
1037 newbuf = (char *)PyMem_REALLOC(newbuf,
1038 newsize);
1039 if (newbuf == NULL) {
1040 tok->done = E_NOMEM;
1041 tok->cur = tok->inp;
1042 return EOF;
1043 }
1044 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001045 tok->cur = tok->buf + cur;
1046 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047 tok->inp = tok->buf + curvalid;
1048 tok->end = tok->buf + newsize;
1049 tok->start = curstart < 0 ? NULL :
1050 tok->buf + curstart;
1051 if (decoding_fgets(tok->inp,
1052 (int)(tok->end - tok->inp),
1053 tok) == NULL) {
1054 /* Break out early on decoding
1055 errors, as tok->buf will be NULL
1056 */
1057 if (tok->decoding_erred)
1058 return EOF;
1059 /* Last line does not end in \n,
1060 fake one */
1061 strcpy(tok->inp, "\n");
1062 }
1063 tok->inp = strchr(tok->inp, '\0');
1064 done = tok->inp[-1] == '\n';
1065 }
1066 if (tok->buf != NULL) {
1067 tok->cur = tok->buf + cur;
1068 tok->line_start = tok->cur;
1069 /* replace "\r\n" with "\n" */
1070 /* For Mac leave the \r, giving a syntax error */
1071 pt = tok->inp - 2;
1072 if (pt >= tok->buf && *pt == '\r') {
1073 *pt++ = '\n';
1074 *pt = '\0';
1075 tok->inp = pt;
1076 }
1077 }
1078 }
1079 if (tok->done != E_OK) {
1080 if (tok->prompt != NULL)
1081 PySys_WriteStderr("\n");
1082 tok->cur = tok->inp;
1083 return EOF;
1084 }
1085 }
1086 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087}
1088
1089
1090/* Back-up one character */
1091
1092static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001093tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001094{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 if (c != EOF) {
1096 if (--tok->cur < tok->buf)
1097 Py_FatalError("tok_backup: beginning of buffer");
1098 if (*tok->cur != c)
1099 *tok->cur = c;
1100 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101}
1102
1103
1104/* Return the token corresponding to a single character */
1105
1106int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001107PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 switch (c) {
1110 case '(': return LPAR;
1111 case ')': return RPAR;
1112 case '[': return LSQB;
1113 case ']': return RSQB;
1114 case ':': return COLON;
1115 case ',': return COMMA;
1116 case ';': return SEMI;
1117 case '+': return PLUS;
1118 case '-': return MINUS;
1119 case '*': return STAR;
1120 case '/': return SLASH;
1121 case '|': return VBAR;
1122 case '&': return AMPER;
1123 case '<': return LESS;
1124 case '>': return GREATER;
1125 case '=': return EQUAL;
1126 case '.': return DOT;
1127 case '%': return PERCENT;
1128 case '{': return LBRACE;
1129 case '}': return RBRACE;
1130 case '^': return CIRCUMFLEX;
1131 case '~': return TILDE;
1132 case '@': return AT;
1133 default: return OP;
1134 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135}
1136
1137
Guido van Rossumfbab9051991-10-20 20:25:03 +00001138int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001139PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001140{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 switch (c1) {
1142 case '=':
1143 switch (c2) {
1144 case '=': return EQEQUAL;
1145 }
1146 break;
1147 case '!':
1148 switch (c2) {
1149 case '=': return NOTEQUAL;
1150 }
1151 break;
1152 case '<':
1153 switch (c2) {
1154 case '>': return NOTEQUAL;
1155 case '=': return LESSEQUAL;
1156 case '<': return LEFTSHIFT;
1157 }
1158 break;
1159 case '>':
1160 switch (c2) {
1161 case '=': return GREATEREQUAL;
1162 case '>': return RIGHTSHIFT;
1163 }
1164 break;
1165 case '+':
1166 switch (c2) {
1167 case '=': return PLUSEQUAL;
1168 }
1169 break;
1170 case '-':
1171 switch (c2) {
1172 case '=': return MINEQUAL;
1173 case '>': return RARROW;
1174 }
1175 break;
1176 case '*':
1177 switch (c2) {
1178 case '*': return DOUBLESTAR;
1179 case '=': return STAREQUAL;
1180 }
1181 break;
1182 case '/':
1183 switch (c2) {
1184 case '/': return DOUBLESLASH;
1185 case '=': return SLASHEQUAL;
1186 }
1187 break;
1188 case '|':
1189 switch (c2) {
1190 case '=': return VBAREQUAL;
1191 }
1192 break;
1193 case '%':
1194 switch (c2) {
1195 case '=': return PERCENTEQUAL;
1196 }
1197 break;
1198 case '&':
1199 switch (c2) {
1200 case '=': return AMPEREQUAL;
1201 }
1202 break;
1203 case '^':
1204 switch (c2) {
1205 case '=': return CIRCUMFLEXEQUAL;
1206 }
1207 break;
1208 }
1209 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001210}
1211
Thomas Wouters434d0822000-08-24 20:11:32 +00001212int
1213PyToken_ThreeChars(int c1, int c2, int c3)
1214{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 switch (c1) {
1216 case '<':
1217 switch (c2) {
1218 case '<':
1219 switch (c3) {
1220 case '=':
1221 return LEFTSHIFTEQUAL;
1222 }
1223 break;
1224 }
1225 break;
1226 case '>':
1227 switch (c2) {
1228 case '>':
1229 switch (c3) {
1230 case '=':
1231 return RIGHTSHIFTEQUAL;
1232 }
1233 break;
1234 }
1235 break;
1236 case '*':
1237 switch (c2) {
1238 case '*':
1239 switch (c3) {
1240 case '=':
1241 return DOUBLESTAREQUAL;
1242 }
1243 break;
1244 }
1245 break;
1246 case '/':
1247 switch (c2) {
1248 case '/':
1249 switch (c3) {
1250 case '=':
1251 return DOUBLESLASHEQUAL;
1252 }
1253 break;
1254 }
1255 break;
1256 case '.':
1257 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001258 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 switch (c3) {
1260 case '.':
1261 return ELLIPSIS;
1262 }
1263 break;
1264 }
1265 break;
1266 }
1267 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001268}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001269
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001271indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001272{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 if (tok->alterror) {
1274 tok->done = E_TABSPACE;
1275 tok->cur = tok->inp;
1276 return 1;
1277 }
1278 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001279#ifdef PGEN
1280 PySys_WriteStderr("inconsistent use of tabs and spaces "
1281 "in indentation\n");
1282#else
1283 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001285#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 tok->altwarning = 0;
1287 }
1288 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001289}
1290
Martin v. Löwis47383402007-08-15 07:32:56 +00001291#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001292#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001293#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294/* Verify that the identifier follows PEP 3131.
1295 All identifier strings are guaranteed to be "ready" unicode objects.
1296 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001297static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001298verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001299{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 PyObject *s;
1301 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001302 if (tok->decoding_erred)
1303 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1307 PyErr_Clear();
1308 tok->done = E_IDENTIFIER;
1309 } else {
1310 tok->done = E_ERROR;
1311 }
1312 return 0;
1313 }
1314 result = PyUnicode_IsIdentifier(s);
1315 Py_DECREF(s);
1316 if (result == 0)
1317 tok->done = E_IDENTIFIER;
1318 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001319}
1320#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001321
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322/* Get next token, after space stripping etc. */
1323
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001324static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001325tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001327 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001329
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001331 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 tok->start = NULL;
1333 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001334
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 /* Get indentation level */
1336 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001337 int col = 0;
1338 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 tok->atbol = 0;
1340 for (;;) {
1341 c = tok_nextc(tok);
1342 if (c == ' ')
1343 col++, altcol++;
1344 else if (c == '\t') {
1345 col = (col/tok->tabsize + 1) * tok->tabsize;
1346 altcol = (altcol/tok->alttabsize + 1)
1347 * tok->alttabsize;
1348 }
1349 else if (c == '\014') /* Control-L (formfeed) */
1350 col = altcol = 0; /* For Emacs users */
1351 else
1352 break;
1353 }
1354 tok_backup(tok, c);
1355 if (c == '#' || c == '\n') {
1356 /* Lines with only whitespace and/or comments
1357 shouldn't affect the indentation and are
1358 not passed to the parser as NEWLINE tokens,
1359 except *totally* empty lines in interactive
1360 mode, which signal the end of a command group. */
1361 if (col == 0 && c == '\n' && tok->prompt != NULL)
1362 blankline = 0; /* Let it through */
1363 else
1364 blankline = 1; /* Ignore completely */
1365 /* We can't jump back right here since we still
1366 may need to skip to the end of a comment */
1367 }
1368 if (!blankline && tok->level == 0) {
1369 if (col == tok->indstack[tok->indent]) {
1370 /* No change */
1371 if (altcol != tok->altindstack[tok->indent]) {
1372 if (indenterror(tok))
1373 return ERRORTOKEN;
1374 }
1375 }
1376 else if (col > tok->indstack[tok->indent]) {
1377 /* Indent -- always one */
1378 if (tok->indent+1 >= MAXINDENT) {
1379 tok->done = E_TOODEEP;
1380 tok->cur = tok->inp;
1381 return ERRORTOKEN;
1382 }
1383 if (altcol <= tok->altindstack[tok->indent]) {
1384 if (indenterror(tok))
1385 return ERRORTOKEN;
1386 }
1387 tok->pendin++;
1388 tok->indstack[++tok->indent] = col;
1389 tok->altindstack[tok->indent] = altcol;
1390 }
1391 else /* col < tok->indstack[tok->indent] */ {
1392 /* Dedent -- any number, must be consistent */
1393 while (tok->indent > 0 &&
1394 col < tok->indstack[tok->indent]) {
1395 tok->pendin--;
1396 tok->indent--;
1397 }
1398 if (col != tok->indstack[tok->indent]) {
1399 tok->done = E_DEDENT;
1400 tok->cur = tok->inp;
1401 return ERRORTOKEN;
1402 }
1403 if (altcol != tok->altindstack[tok->indent]) {
1404 if (indenterror(tok))
1405 return ERRORTOKEN;
1406 }
1407 }
1408 }
1409 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 /* Return pending indents/dedents */
1414 if (tok->pendin != 0) {
1415 if (tok->pendin < 0) {
1416 tok->pendin++;
1417 return DEDENT;
1418 }
1419 else {
1420 tok->pendin--;
1421 return INDENT;
1422 }
1423 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426 tok->start = NULL;
1427 /* Skip spaces */
1428 do {
1429 c = tok_nextc(tok);
1430 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001431
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 /* Set start of current token */
1433 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 /* Skip comment */
1436 if (c == '#')
1437 while (c != EOF && c != '\n')
1438 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001439
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 /* Check for EOF and errors now */
1441 if (c == EOF) {
1442 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1443 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 /* Identifier (most frequent token!) */
1446 nonascii = 0;
1447 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001448 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001449 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001450 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001451 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001452 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001453 /* Since this is a backwards compatibility support literal we don't
1454 want to support it in arbitrary order like byte literals. */
1455 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1456 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001457 /* ur"" and ru"" are not supported */
1458 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001459 saw_r = 1;
1460 else
1461 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 c = tok_nextc(tok);
1463 if (c == '"' || c == '\'')
1464 goto letter_quote;
1465 }
1466 while (is_potential_identifier_char(c)) {
1467 if (c >= 128)
1468 nonascii = 1;
1469 c = tok_nextc(tok);
1470 }
1471 tok_backup(tok, c);
Benjamin Petersond73aca72015-04-21 12:05:19 -04001472 if (nonascii && !verify_identifier(tok))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 return ERRORTOKEN;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001474 *p_start = tok->start;
1475 *p_end = tok->cur;
1476 return NAME;
1477 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001478
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001479 /* Newline */
1480 if (c == '\n') {
1481 tok->atbol = 1;
1482 if (blankline || tok->level > 0)
1483 goto nextline;
1484 *p_start = tok->start;
1485 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1486 tok->cont_line = 0;
1487 return NEWLINE;
1488 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001489
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 /* Period or number starting with period? */
1491 if (c == '.') {
1492 c = tok_nextc(tok);
1493 if (isdigit(c)) {
1494 goto fraction;
1495 } else if (c == '.') {
1496 c = tok_nextc(tok);
1497 if (c == '.') {
1498 *p_start = tok->start;
1499 *p_end = tok->cur;
1500 return ELLIPSIS;
1501 } else {
1502 tok_backup(tok, c);
1503 }
1504 tok_backup(tok, '.');
1505 } else {
1506 tok_backup(tok, c);
1507 }
1508 *p_start = tok->start;
1509 *p_end = tok->cur;
1510 return DOT;
1511 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001512
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 /* Number */
1514 if (isdigit(c)) {
1515 if (c == '0') {
1516 /* Hex, octal or binary -- maybe. */
1517 c = tok_nextc(tok);
1518 if (c == '.')
1519 goto fraction;
1520 if (c == 'j' || c == 'J')
1521 goto imaginary;
1522 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001523
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 /* Hex */
1525 c = tok_nextc(tok);
1526 if (!isxdigit(c)) {
1527 tok->done = E_TOKEN;
1528 tok_backup(tok, c);
1529 return ERRORTOKEN;
1530 }
1531 do {
1532 c = tok_nextc(tok);
1533 } while (isxdigit(c));
1534 }
1535 else if (c == 'o' || c == 'O') {
1536 /* Octal */
1537 c = tok_nextc(tok);
1538 if (c < '0' || c >= '8') {
1539 tok->done = E_TOKEN;
1540 tok_backup(tok, c);
1541 return ERRORTOKEN;
1542 }
1543 do {
1544 c = tok_nextc(tok);
1545 } while ('0' <= c && c < '8');
1546 }
1547 else if (c == 'b' || c == 'B') {
1548 /* Binary */
1549 c = tok_nextc(tok);
1550 if (c != '0' && c != '1') {
1551 tok->done = E_TOKEN;
1552 tok_backup(tok, c);
1553 return ERRORTOKEN;
1554 }
1555 do {
1556 c = tok_nextc(tok);
1557 } while (c == '0' || c == '1');
1558 }
1559 else {
1560 int nonzero = 0;
1561 /* maybe old-style octal; c is first char of it */
1562 /* in any case, allow '0' as a literal */
1563 while (c == '0')
1564 c = tok_nextc(tok);
1565 while (isdigit(c)) {
1566 nonzero = 1;
1567 c = tok_nextc(tok);
1568 }
1569 if (c == '.')
1570 goto fraction;
1571 else if (c == 'e' || c == 'E')
1572 goto exponent;
1573 else if (c == 'j' || c == 'J')
1574 goto imaginary;
1575 else if (nonzero) {
1576 tok->done = E_TOKEN;
1577 tok_backup(tok, c);
1578 return ERRORTOKEN;
1579 }
1580 }
1581 }
1582 else {
1583 /* Decimal */
1584 do {
1585 c = tok_nextc(tok);
1586 } while (isdigit(c));
1587 {
1588 /* Accept floating point numbers. */
1589 if (c == '.') {
1590 fraction:
1591 /* Fraction */
1592 do {
1593 c = tok_nextc(tok);
1594 } while (isdigit(c));
1595 }
1596 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001597 int e;
1598 exponent:
1599 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 /* Exponent part */
1601 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001602 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001604 if (!isdigit(c)) {
1605 tok->done = E_TOKEN;
1606 tok_backup(tok, c);
1607 return ERRORTOKEN;
1608 }
1609 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001611 tok_backup(tok, e);
1612 *p_start = tok->start;
1613 *p_end = tok->cur;
1614 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001615 }
1616 do {
1617 c = tok_nextc(tok);
1618 } while (isdigit(c));
1619 }
1620 if (c == 'j' || c == 'J')
1621 /* Imaginary part */
1622 imaginary:
1623 c = tok_nextc(tok);
1624 }
1625 }
1626 tok_backup(tok, c);
1627 *p_start = tok->start;
1628 *p_end = tok->cur;
1629 return NUMBER;
1630 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001631
1632 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 /* String */
1634 if (c == '\'' || c == '"') {
1635 int quote = c;
1636 int quote_size = 1; /* 1 or 3 */
1637 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001638
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001639 /* Find the quote size and start of string */
1640 c = tok_nextc(tok);
1641 if (c == quote) {
1642 c = tok_nextc(tok);
1643 if (c == quote)
1644 quote_size = 3;
1645 else
1646 end_quote_size = 1; /* empty string found */
1647 }
1648 if (c != quote)
1649 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001650
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 /* Get rest of string */
1652 while (end_quote_size != quote_size) {
1653 c = tok_nextc(tok);
1654 if (c == EOF) {
1655 if (quote_size == 3)
1656 tok->done = E_EOFS;
1657 else
1658 tok->done = E_EOLS;
1659 tok->cur = tok->inp;
1660 return ERRORTOKEN;
1661 }
1662 if (quote_size == 1 && c == '\n') {
1663 tok->done = E_EOLS;
1664 tok->cur = tok->inp;
1665 return ERRORTOKEN;
1666 }
1667 if (c == quote)
1668 end_quote_size += 1;
1669 else {
1670 end_quote_size = 0;
1671 if (c == '\\')
1672 c = tok_nextc(tok); /* skip escaped char */
1673 }
1674 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001675
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676 *p_start = tok->start;
1677 *p_end = tok->cur;
1678 return STRING;
1679 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001680
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 /* Line continuation */
1682 if (c == '\\') {
1683 c = tok_nextc(tok);
1684 if (c != '\n') {
1685 tok->done = E_LINECONT;
1686 tok->cur = tok->inp;
1687 return ERRORTOKEN;
1688 }
1689 tok->cont_line = 1;
1690 goto again; /* Read next line */
1691 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001692
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 /* Check for two-character token */
1694 {
1695 int c2 = tok_nextc(tok);
1696 int token = PyToken_TwoChars(c, c2);
1697 if (token != OP) {
1698 int c3 = tok_nextc(tok);
1699 int token3 = PyToken_ThreeChars(c, c2, c3);
1700 if (token3 != OP) {
1701 token = token3;
1702 } else {
1703 tok_backup(tok, c3);
1704 }
1705 *p_start = tok->start;
1706 *p_end = tok->cur;
1707 return token;
1708 }
1709 tok_backup(tok, c2);
1710 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001711
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001712 /* Keep track of parentheses nesting level */
1713 switch (c) {
1714 case '(':
1715 case '[':
1716 case '{':
1717 tok->level++;
1718 break;
1719 case ')':
1720 case ']':
1721 case '}':
1722 tok->level--;
1723 break;
1724 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001725
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001726 /* Punctuation character */
1727 *p_start = tok->start;
1728 *p_end = tok->cur;
1729 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001730}
1731
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001732int
1733PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1734{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 int result = tok_get(tok, p_start, p_end);
1736 if (tok->decoding_erred) {
1737 result = ERRORTOKEN;
1738 tok->done = E_DECODE;
1739 }
1740 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001741}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001742
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001743/* Get the encoding of a Python file. Check for the coding cookie and check if
1744 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001745
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001746 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1747 encoding in the first or second line of the file (in which case the encoding
1748 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001749
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001750 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1751 by the caller. */
1752
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001753char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001754PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001755{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 struct tok_state *tok;
1757 FILE *fp;
1758 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001759
Victor Stinnerdaf45552013-08-28 00:53:59 +02001760#ifndef PGEN
1761 fd = _Py_dup(fd);
1762#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001763 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001764#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 if (fd < 0) {
1766 return NULL;
1767 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001768
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 fp = fdopen(fd, "r");
1770 if (fp == NULL) {
1771 return NULL;
1772 }
1773 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1774 if (tok == NULL) {
1775 fclose(fp);
1776 return NULL;
1777 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001778#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001779 if (filename != NULL) {
1780 Py_INCREF(filename);
1781 tok->filename = filename;
1782 }
1783 else {
1784 tok->filename = PyUnicode_FromString("<string>");
1785 if (tok->filename == NULL) {
1786 fclose(fp);
1787 PyTokenizer_Free(tok);
1788 return encoding;
1789 }
1790 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001791#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001792 while (tok->lineno < 2 && tok->done == E_OK) {
1793 PyTokenizer_Get(tok, &p_start, &p_end);
1794 }
1795 fclose(fp);
1796 if (tok->encoding) {
1797 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1798 if (encoding)
1799 strcpy(encoding, tok->encoding);
1800 }
1801 PyTokenizer_Free(tok);
1802 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001803}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001804
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001805char *
1806PyTokenizer_FindEncoding(int fd)
1807{
1808 return PyTokenizer_FindEncodingFilename(fd, NULL);
1809}
1810
#ifdef Py_DEBUG

/* Debug helper: print the token's name to stdout, followed by its text
   in parentheses for NAME, NUMBER, STRING and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif