blob: 8f0a9c810053a92683e2e7ca96dd4f10bd355cce [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* True if C may start an identifier: an ASCII letter, '_', or any value
   >= 128 (non-ASCII).  Every use of the argument is parenthesized so that
   an expression argument expands correctly.  The argument is still
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000027
/* True if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128 (non-ASCII).  Every use of the argument is
   parenthesized so that an expression argument expands correctly.  The
   argument is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Serhiy Storchakac6792272013-10-19 21:03:34 +030035extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000036/* Return malloc'ed string including trailing \n;
37 empty malloc'ed string for EOF;
38 NULL if interrupted */
39
Guido van Rossum4fe87291992-02-26 15:24:44 +000040/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000044static struct tok_state *tok_new(void);
45static int tok_nextc(struct tok_state *tok);
46static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000047
Brett Cannond5ec98c2007-10-20 02:54:14 +000048
Guido van Rossumdcfcd142019-01-31 03:40:27 -080049/* Spaces in this constant are treated as "zero or more spaces or tabs" when
50 tokenizing. */
51static const char* type_comment_prefix = "# type: ";
52
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053/* Create and initialize a new tok_state structure */
54
55static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000056tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59 sizeof(struct tok_state));
60 if (tok == NULL)
61 return NULL;
62 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
63 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040069
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->read_coding_spec = 0;
79 tok->enc = NULL;
80 tok->encoding = NULL;
81 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020082 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 tok->decoding_readline = NULL;
84 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080085 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030086
Guido van Rossum495da292019-03-07 12:38:08 -080087 tok->async_hacks = 0;
88 tok->async_def = 0;
89 tok->async_def_indent = 0;
90 tok->async_def_nl = 0;
91
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000092 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000093}
94
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070096new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000097{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070099 if (!result) {
100 tok->done = E_NOMEM;
101 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000102 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700103 memcpy(result, s, len);
104 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000105 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000106}
107
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000108static char *
109error_ret(struct tok_state *tok) /* XXX */
110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000111 tok->decoding_erred = 1;
112 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
113 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200114 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map the common spellings of UTF-8 and Latin-1 to one canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S is one of the recognised names (optionally followed by "-..."),
   otherwise returns S itself unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is undefined for negative values other than EOF,
               and plain char may be signed: go through unsigned char. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
166 if (strncmp(t, "coding", 6) == 0) {
167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
173 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
187 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000211
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200212 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700217 if (!get_coding_spec(line, &cs, size, tok))
218 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200219 if (!cs) {
220 Py_ssize_t i;
221 for (i = 0; i < size; i++) {
222 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223 break;
224 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225 /* Stop checking coding spec after a line containing
226 * anything except a comment. */
227 tok->read_coding_spec = 1;
228 break;
229 }
230 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700231 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200232 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 tok->read_coding_spec = 1;
234 if (tok->encoding == NULL) {
235 assert(tok->decoding_state == STATE_RAW);
236 if (strcmp(cs, "utf-8") == 0) {
237 tok->encoding = cs;
238 } else {
239 r = set_readline(tok, cs);
240 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700244 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300245 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 "encoding problem: %s", cs);
247 PyMem_FREE(cs);
248 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 } else { /* then, compare cs with BOM */
251 r = (strcmp(tok->encoding, cs) == 0);
252 if (!r)
253 PyErr_Format(PyExc_SyntaxError,
254 "encoding problem: %s with BOM", cs);
255 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258}
259
260/* See whether the file starts with a BOM. If it does,
261 invoke the set_readline function with the new encoding.
262 Return 1 on success, 0 on failure. */
263
264static int
265check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 void unget_char(int, struct tok_state *),
267 int set_readline(struct tok_state *, const char *),
268 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 int ch1, ch2, ch3;
271 ch1 = get_char(tok);
272 tok->decoding_state = STATE_RAW;
273 if (ch1 == EOF) {
274 return 1;
275 } else if (ch1 == 0xEF) {
276 ch2 = get_char(tok);
277 if (ch2 != 0xBB) {
278 unget_char(ch2, tok);
279 unget_char(ch1, tok);
280 return 1;
281 }
282 ch3 = get_char(tok);
283 if (ch3 != 0xBF) {
284 unget_char(ch3, tok);
285 unget_char(ch2, tok);
286 unget_char(ch1, tok);
287 return 1;
288 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 /* Disable support for UTF-16 BOMs until a decision
291 is made whether this needs to be supported. */
292 } else if (ch1 == 0xFE) {
293 ch2 = get_char(tok);
294 if (ch2 != 0xFF) {
295 unget_char(ch2, tok);
296 unget_char(ch1, tok);
297 return 1;
298 }
299 if (!set_readline(tok, "utf-16-be"))
300 return 0;
301 tok->decoding_state = STATE_NORMAL;
302 } else if (ch1 == 0xFF) {
303 ch2 = get_char(tok);
304 if (ch2 != 0xFE) {
305 unget_char(ch2, tok);
306 unget_char(ch1, tok);
307 return 1;
308 }
309 if (!set_readline(tok, "utf-16-le"))
310 return 0;
311 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000312#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000313 } else {
314 unget_char(ch1, tok);
315 return 1;
316 }
317 if (tok->encoding != NULL)
318 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 tok->encoding = new_string("utf-8", 5, tok);
320 if (!tok->encoding)
321 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 /* No need to set_readline: input is already utf-8 */
323 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000324}
325
326/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000327 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000328
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000329 On entry, tok->decoding_buffer will be one of:
330 1) NULL: need to call tok->decoding_readline to get a new line
331 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000333 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 (in the s buffer) to copy entire contents of the line read
335 by tok->decoding_readline. tok->decoding_buffer has the overflow.
336 In this case, fp_readl is called in a loop (with an expanded buffer)
337 until the buffer ends with a '\n' (or until the end of the file is
338 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000339*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340
341static char *
342fp_readl(char *s, int size, struct tok_state *tok)
343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 PyObject* bufobj;
345 const char *buf;
346 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 /* Ask for one less byte so we can terminate it */
349 assert(size > 0);
350 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 if (tok->decoding_buffer) {
353 bufobj = tok->decoding_buffer;
354 Py_INCREF(bufobj);
355 }
356 else
357 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100358 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 if (bufobj == NULL)
360 goto error;
361 }
362 if (PyUnicode_CheckExact(bufobj))
363 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200364 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 if (buf == NULL) {
366 goto error;
367 }
368 }
369 else
370 {
371 buf = PyByteArray_AsString(bufobj);
372 if (buf == NULL) {
373 goto error;
374 }
375 buflen = PyByteArray_GET_SIZE(bufobj);
376 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 Py_XDECREF(tok->decoding_buffer);
379 if (buflen > size) {
380 /* Too many chars, the rest goes into tok->decoding_buffer */
381 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382 buflen-size);
383 if (tok->decoding_buffer == NULL)
384 goto error;
385 buflen = size;
386 }
387 else
388 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 memcpy(s, buf, buflen);
391 s[buflen] = '\0';
392 if (buflen == 0) /* EOF */
393 s = NULL;
394 Py_DECREF(bufobj);
395 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000396
397error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 Py_XDECREF(bufobj);
399 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400}
401
402/* Set the readline function for TOK to a StreamReader's
403 readline function. The StreamReader is named ENC.
404
405 This function is called from check_bom and check_coding_spec.
406
407 ENC is usually identical to the future value of tok->encoding,
408 except for the (currently unsupported) case of UTF-16.
409
410 Return 1 on success, 0 on failure. */
411
412static int
413fp_setreadl(struct tok_state *tok, const char* enc)
414{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700415 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200416 _Py_IDENTIFIER(open);
417 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000418 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200419 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000420
Victor Stinner22a351a2010-10-14 12:04:34 +0000421 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200422 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100423 * position of tok->fp. If tok->fp was opened in text mode on Windows,
424 * its file position counts CRLF as one char and can't be directly mapped
425 * to the file offset for fd. Instead we step back one byte and read to
426 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200427 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100428 if (pos == -1 ||
429 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000430 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700431 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000432 }
433
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700434 io = PyImport_ImportModuleNoBlock("io");
435 if (io == NULL)
436 return 0;
437
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200438 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000439 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700440 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700442 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200444 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700445 Py_DECREF(stream);
446 if (readline == NULL)
447 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300448 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100450 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100451 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700452 if (bufobj == NULL)
453 return 0;
454 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100455 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700457 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458}
459
460/* Fetch the next byte from TOK. */
461
462static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464}
465
466/* Unfetch the last byte back into TOK. */
467
468static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470}
471
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   Only the lead byte class and the 0x80..0xBF range of the continuation
   bytes are checked.  S must be NUL-terminated: the continuation bytes
   are examined front to back, and a NUL is not a continuation byte, so
   a truncated sequence is rejected without reading past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
499
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500/* Read a line of input from TOK. Determine encoding
501 if necessary. */
502
503static char *
504decoding_fgets(char *s, int size, struct tok_state *tok)
505{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 char *line = NULL;
507 int badchar = 0;
508 for (;;) {
509 if (tok->decoding_state == STATE_NORMAL) {
510 /* We already have a codec associated with
511 this input. */
512 line = fp_readl(s, size, tok);
513 break;
514 } else if (tok->decoding_state == STATE_RAW) {
515 /* We want a 'raw' read. */
516 line = Py_UniversalNewlineFgets(s, size,
517 tok->fp, NULL);
518 break;
519 } else {
520 /* We have not yet determined the encoding.
521 If an encoding is found, use the file-pointer
522 reader functions from now on. */
523 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524 return error_ret(tok);
525 assert(tok->decoding_state != STATE_INIT);
526 }
527 }
528 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530 return error_ret(tok);
531 }
532 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000533 /* The default encoding is UTF-8, so make sure we don't have any
534 non-UTF-8 sequences in it. */
535 if (line && !tok->encoding) {
536 unsigned char *c;
537 int length;
538 for (c = (unsigned char *)line; *c; c += length)
539 if (!(length = valid_utf8(c))) {
540 badchar = *c;
541 break;
542 }
543 }
544 if (badchar) {
545 /* Need to add 1 to the line number, since this line
546 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200547 PyErr_Format(PyExc_SyntaxError,
548 "Non-UTF-8 code starting with '\\x%.2x' "
549 "in file %U on line %i, "
550 "but no encoding declared; "
551 "see http://python.org/dev/peps/pep-0263/ for details",
552 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000553 return error_ret(tok);
554 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000555 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556}
557
558static int
559decoding_feof(struct tok_state *tok)
560{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 if (tok->decoding_state != STATE_NORMAL) {
562 return feof(tok->fp);
563 } else {
564 PyObject* buf = tok->decoding_buffer;
565 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100566 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000567 if (buf == NULL) {
568 error_ret(tok);
569 return 1;
570 } else {
571 tok->decoding_buffer = buf;
572 }
573 }
574 return PyObject_Length(buf) == 0;
575 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576}
577
578/* Fetch a byte from TOK, using the string buffer. */
579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000580static int
581buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000582 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000583}
584
585/* Unfetch a byte from TOK, using the string buffer. */
586
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000587static void
588buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000589 tok->str--;
590 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
593/* Set the readline function for TOK to ENC. For the string-based
594 tokenizer, this means to just record the encoding. */
595
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000596static int
597buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 tok->enc = enc;
599 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600}
601
602/* Return a UTF-8 encoding Python string object from the
603 C byte string STR, which is encoded with ENC. */
604
605static PyObject *
606translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 PyObject *utf8;
608 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609 if (buf == NULL)
610 return NULL;
611 utf8 = PyUnicode_AsUTF8String(buf);
612 Py_DECREF(buf);
613 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614}
615
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000616
617static char *
618translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200619 int skip_next_lf = 0;
620 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 char *buf, *current;
622 char c = '\0';
623 buf = PyMem_MALLOC(needed_length);
624 if (buf == NULL) {
625 tok->done = E_NOMEM;
626 return NULL;
627 }
628 for (current = buf; *s; s++, current++) {
629 c = *s;
630 if (skip_next_lf) {
631 skip_next_lf = 0;
632 if (c == '\n') {
633 c = *++s;
634 if (!c)
635 break;
636 }
637 }
638 if (c == '\r') {
639 skip_next_lf = 1;
640 c = '\n';
641 }
642 *current = c;
643 }
644 /* If this is exec input, add a newline to the end of the string if
645 there isn't one already. */
646 if (exec_input && c != '\n') {
647 *current = '\n';
648 current++;
649 }
650 *current = '\0';
651 final_length = current - buf + 1;
652 if (final_length < needed_length && final_length)
653 /* should never fail */
654 buf = PyMem_REALLOC(buf, final_length);
655 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000656}
657
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658/* Decode a byte string STR for use as the buffer of TOK.
659 Look for encoding declarations inside STR, and record them
660 inside TOK. */
661
662static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000663decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 PyObject* utf8 = NULL;
666 const char *str;
667 const char *s;
668 const char *newl[2] = {NULL, NULL};
669 int lineno = 0;
670 tok->input = str = translate_newlines(input, single, tok);
671 if (str == NULL)
672 return NULL;
673 tok->enc = NULL;
674 tok->str = str;
675 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
676 return error_ret(tok);
677 str = tok->str; /* string after BOM if any */
678 assert(str);
679 if (tok->enc != NULL) {
680 utf8 = translate_into_utf8(str, tok->enc);
681 if (utf8 == NULL)
682 return error_ret(tok);
683 str = PyBytes_AsString(utf8);
684 }
685 for (s = str;; s++) {
686 if (*s == '\0') break;
687 else if (*s == '\n') {
688 assert(lineno < 2);
689 newl[lineno] = s;
690 lineno++;
691 if (lineno == 2) break;
692 }
693 }
694 tok->enc = NULL;
695 /* need to check line 1 and 2 separately since check_coding_spec
696 assumes a single line as input */
697 if (newl[0]) {
698 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
699 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200700 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000701 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
702 tok, buf_setreadl))
703 return error_ret(tok);
704 }
705 }
706 if (tok->enc != NULL) {
707 assert(utf8 == NULL);
708 utf8 = translate_into_utf8(str, tok->enc);
709 if (utf8 == NULL)
710 return error_ret(tok);
711 str = PyBytes_AS_STRING(utf8);
712 }
713 assert(tok->decoding_buffer == NULL);
714 tok->decoding_buffer = utf8; /* CAUTION */
715 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000716}
717
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000718/* Set up tokenizer for string */
719
720struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000721PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 struct tok_state *tok = tok_new();
724 if (tok == NULL)
725 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300726 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 if (str == NULL) {
728 PyTokenizer_Free(tok);
729 return NULL;
730 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000731
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 /* XXX: constify members. */
733 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
734 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735}
736
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000737struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000738PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000739{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 struct tok_state *tok = tok_new();
741 if (tok == NULL)
742 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 tok->input = str = translate_newlines(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 if (str == NULL) {
745 PyTokenizer_Free(tok);
746 return NULL;
747 }
748 tok->decoding_state = STATE_RAW;
749 tok->read_coding_spec = 1;
750 tok->enc = NULL;
751 tok->str = str;
752 tok->encoding = (char *)PyMem_MALLOC(6);
753 if (!tok->encoding) {
754 PyTokenizer_Free(tok);
755 return NULL;
756 }
757 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000758
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 /* XXX: constify members. */
760 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
761 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000762}
763
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000764/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765
766struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300767PyTokenizer_FromFile(FILE *fp, const char* enc,
768 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000770 struct tok_state *tok = tok_new();
771 if (tok == NULL)
772 return NULL;
773 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
774 PyTokenizer_Free(tok);
775 return NULL;
776 }
777 tok->cur = tok->inp = tok->buf;
778 tok->end = tok->buf + BUFSIZ;
779 tok->fp = fp;
780 tok->prompt = ps1;
781 tok->nextprompt = ps2;
782 if (enc != NULL) {
783 /* Must copy encoding declaration since it
784 gets copied into the parse tree. */
785 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
786 if (!tok->encoding) {
787 PyTokenizer_Free(tok);
788 return NULL;
789 }
790 strcpy(tok->encoding, enc);
791 tok->decoding_state = STATE_NORMAL;
792 }
793 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000794}
795
796
797/* Free a tok_state structure */
798
799void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000800PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000801{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 if (tok->encoding != NULL)
803 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 Py_XDECREF(tok->decoding_readline);
805 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200806 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 if (tok->fp != NULL && tok->buf != NULL)
808 PyMem_FREE(tok->buf);
809 if (tok->input)
810 PyMem_FREE((char *)tok->input);
811 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812}
813
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000814/* Get next char, updating state; error code goes into tok->done */
815
816static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200817tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 for (;;) {
820 if (tok->cur != tok->inp) {
821 return Py_CHARMASK(*tok->cur++); /* Fast path */
822 }
823 if (tok->done != E_OK)
824 return EOF;
825 if (tok->fp == NULL) {
826 char *end = strchr(tok->inp, '\n');
827 if (end != NULL)
828 end++;
829 else {
830 end = strchr(tok->inp, '\0');
831 if (end == tok->inp) {
832 tok->done = E_EOF;
833 return EOF;
834 }
835 }
836 if (tok->start == NULL)
837 tok->buf = tok->cur;
838 tok->line_start = tok->cur;
839 tok->lineno++;
840 tok->inp = end;
841 return Py_CHARMASK(*tok->cur++);
842 }
843 if (tok->prompt != NULL) {
844 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000845 if (newtok != NULL) {
846 char *translated = translate_newlines(newtok, 0, tok);
847 PyMem_FREE(newtok);
848 if (translated == NULL)
849 return EOF;
850 newtok = translated;
851 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 if (tok->encoding && newtok && *newtok) {
853 /* Recode to UTF-8 */
854 Py_ssize_t buflen;
855 const char* buf;
856 PyObject *u = translate_into_utf8(newtok, tok->encoding);
857 PyMem_FREE(newtok);
858 if (!u) {
859 tok->done = E_DECODE;
860 return EOF;
861 }
862 buflen = PyBytes_GET_SIZE(u);
863 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700865 if (newtok == NULL) {
866 Py_DECREF(u);
867 tok->done = E_NOMEM;
868 return EOF;
869 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 strcpy(newtok, buf);
871 Py_DECREF(u);
872 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000873 if (tok->nextprompt != NULL)
874 tok->prompt = tok->nextprompt;
875 if (newtok == NULL)
876 tok->done = E_INTR;
877 else if (*newtok == '\0') {
878 PyMem_FREE(newtok);
879 tok->done = E_EOF;
880 }
881 else if (tok->start != NULL) {
882 size_t start = tok->start - tok->buf;
883 size_t oldlen = tok->cur - tok->buf;
884 size_t newlen = oldlen + strlen(newtok);
885 char *buf = tok->buf;
886 buf = (char *)PyMem_REALLOC(buf, newlen+1);
887 tok->lineno++;
888 if (buf == NULL) {
889 PyMem_FREE(tok->buf);
890 tok->buf = NULL;
891 PyMem_FREE(newtok);
892 tok->done = E_NOMEM;
893 return EOF;
894 }
895 tok->buf = buf;
896 tok->cur = tok->buf + oldlen;
897 tok->line_start = tok->cur;
898 strcpy(tok->buf + oldlen, newtok);
899 PyMem_FREE(newtok);
900 tok->inp = tok->buf + newlen;
901 tok->end = tok->inp + 1;
902 tok->start = tok->buf + start;
903 }
904 else {
905 tok->lineno++;
906 if (tok->buf != NULL)
907 PyMem_FREE(tok->buf);
908 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000909 tok->cur = tok->buf;
910 tok->line_start = tok->buf;
911 tok->inp = strchr(tok->buf, '\0');
912 tok->end = tok->inp + 1;
913 }
914 }
915 else {
916 int done = 0;
917 Py_ssize_t cur = 0;
918 char *pt;
919 if (tok->start == NULL) {
920 if (tok->buf == NULL) {
921 tok->buf = (char *)
922 PyMem_MALLOC(BUFSIZ);
923 if (tok->buf == NULL) {
924 tok->done = E_NOMEM;
925 return EOF;
926 }
927 tok->end = tok->buf + BUFSIZ;
928 }
929 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
930 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200931 if (!tok->decoding_erred)
932 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000933 done = 1;
934 }
935 else {
936 tok->done = E_OK;
937 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700938 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 }
940 }
941 else {
942 cur = tok->cur - tok->buf;
943 if (decoding_feof(tok)) {
944 tok->done = E_EOF;
945 done = 1;
946 }
947 else
948 tok->done = E_OK;
949 }
950 tok->lineno++;
951 /* Read until '\n' or EOF */
952 while (!done) {
953 Py_ssize_t curstart = tok->start == NULL ? -1 :
954 tok->start - tok->buf;
955 Py_ssize_t curvalid = tok->inp - tok->buf;
956 Py_ssize_t newsize = curvalid + BUFSIZ;
957 char *newbuf = tok->buf;
958 newbuf = (char *)PyMem_REALLOC(newbuf,
959 newsize);
960 if (newbuf == NULL) {
961 tok->done = E_NOMEM;
962 tok->cur = tok->inp;
963 return EOF;
964 }
965 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200966 tok->cur = tok->buf + cur;
967 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 tok->inp = tok->buf + curvalid;
969 tok->end = tok->buf + newsize;
970 tok->start = curstart < 0 ? NULL :
971 tok->buf + curstart;
972 if (decoding_fgets(tok->inp,
973 (int)(tok->end - tok->inp),
974 tok) == NULL) {
975 /* Break out early on decoding
976 errors, as tok->buf will be NULL
977 */
978 if (tok->decoding_erred)
979 return EOF;
980 /* Last line does not end in \n,
981 fake one */
982 strcpy(tok->inp, "\n");
983 }
984 tok->inp = strchr(tok->inp, '\0');
985 done = tok->inp[-1] == '\n';
986 }
987 if (tok->buf != NULL) {
988 tok->cur = tok->buf + cur;
989 tok->line_start = tok->cur;
990 /* replace "\r\n" with "\n" */
991 /* For Mac leave the \r, giving a syntax error */
992 pt = tok->inp - 2;
993 if (pt >= tok->buf && *pt == '\r') {
994 *pt++ = '\n';
995 *pt = '\0';
996 tok->inp = pt;
997 }
998 }
999 }
1000 if (tok->done != E_OK) {
1001 if (tok->prompt != NULL)
1002 PySys_WriteStderr("\n");
1003 tok->cur = tok->inp;
1004 return EOF;
1005 }
1006 }
1007 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001008}
1009
1010
1011/* Back-up one character */
1012
1013static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001014tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001015{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 if (c != EOF) {
1017 if (--tok->cur < tok->buf)
1018 Py_FatalError("tok_backup: beginning of buffer");
1019 if (*tok->cur != c)
1020 *tok->cur = c;
1021 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001022}
1023
1024
Guido van Rossum926f13a1998-04-09 21:38:06 +00001025static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001026syntaxerror(struct tok_state *tok, const char *format, ...)
1027{
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001028 va_list vargs;
1029#ifdef HAVE_STDARG_PROTOTYPES
1030 va_start(vargs, format);
1031#else
1032 va_start(vargs);
1033#endif
1034 PyErr_FormatV(PyExc_SyntaxError, format, vargs);
1035 va_end(vargs);
1036 PyErr_SyntaxLocationObject(tok->filename,
1037 tok->lineno,
Victor Stinnerc8846162018-07-21 03:36:06 +02001038 (int)(tok->cur - tok->line_start));
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001039 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001040 return ERRORTOKEN;
1041}
1042
1043static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001044indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001045{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001046 tok->done = E_TABSPACE;
1047 tok->cur = tok->inp;
1048 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001049}
1050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051/* Verify that the identifier follows PEP 3131.
1052 All identifier strings are guaranteed to be "ready" unicode objects.
1053 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001054static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001055verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001056{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 PyObject *s;
1058 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001059 if (tok->decoding_erred)
1060 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001062 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1064 PyErr_Clear();
1065 tok->done = E_IDENTIFIER;
1066 } else {
1067 tok->done = E_ERROR;
1068 }
1069 return 0;
1070 }
1071 result = PyUnicode_IsIdentifier(s);
1072 Py_DECREF(s);
1073 if (result == 0)
1074 tok->done = E_IDENTIFIER;
1075 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001076}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001077
/* Consume the rest of a run of decimal digits in which single underscores
   may separate digit groups.

   Returns the first character after the run; that character has been read
   and is NOT pushed back.  If an underscore is not followed by a digit, the
   offending character is pushed back, a SyntaxError is raised through
   syntaxerror(), and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Swallow one group of digits. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* An underscore must be followed by at least one more digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1099
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001100/* Get next token, after space stripping etc. */
1101
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001102static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001103tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001105 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001107
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001109 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 tok->start = NULL;
1111 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001112
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001113 /* Get indentation level */
1114 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001115 int col = 0;
1116 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 tok->atbol = 0;
1118 for (;;) {
1119 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001120 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001122 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001124 col = (col / tok->tabsize + 1) * tok->tabsize;
1125 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 }
Brett Cannona721aba2016-09-09 14:57:09 -07001127 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001129 }
1130 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001132 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001133 }
1134 tok_backup(tok, c);
1135 if (c == '#' || c == '\n') {
1136 /* Lines with only whitespace and/or comments
1137 shouldn't affect the indentation and are
1138 not passed to the parser as NEWLINE tokens,
1139 except *totally* empty lines in interactive
1140 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001141 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001143 }
1144 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001146 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 /* We can't jump back right here since we still
1148 may need to skip to the end of a comment */
1149 }
1150 if (!blankline && tok->level == 0) {
1151 if (col == tok->indstack[tok->indent]) {
1152 /* No change */
1153 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001154 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 }
1156 }
1157 else if (col > tok->indstack[tok->indent]) {
1158 /* Indent -- always one */
1159 if (tok->indent+1 >= MAXINDENT) {
1160 tok->done = E_TOODEEP;
1161 tok->cur = tok->inp;
1162 return ERRORTOKEN;
1163 }
1164 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001165 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 }
1167 tok->pendin++;
1168 tok->indstack[++tok->indent] = col;
1169 tok->altindstack[tok->indent] = altcol;
1170 }
1171 else /* col < tok->indstack[tok->indent] */ {
1172 /* Dedent -- any number, must be consistent */
1173 while (tok->indent > 0 &&
1174 col < tok->indstack[tok->indent]) {
1175 tok->pendin--;
1176 tok->indent--;
1177 }
1178 if (col != tok->indstack[tok->indent]) {
1179 tok->done = E_DEDENT;
1180 tok->cur = tok->inp;
1181 return ERRORTOKEN;
1182 }
1183 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001184 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 }
1186 }
1187 }
1188 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001189
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001191
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001192 /* Return pending indents/dedents */
1193 if (tok->pendin != 0) {
1194 if (tok->pendin < 0) {
1195 tok->pendin++;
1196 return DEDENT;
1197 }
1198 else {
1199 tok->pendin--;
1200 return INDENT;
1201 }
1202 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001203
Guido van Rossum495da292019-03-07 12:38:08 -08001204 /* Peek ahead at the next character */
1205 c = tok_nextc(tok);
1206 tok_backup(tok, c);
1207 /* Check if we are closing an async function */
1208 if (tok->async_def
1209 && !blankline
1210 /* Due to some implementation artifacts of type comments,
1211 * a TYPE_COMMENT at the start of a function won't set an
1212 * indentation level and it will produce a NEWLINE after it.
1213 * To avoid spuriously ending an async function due to this,
1214 * wait until we have some non-newline char in front of us. */
1215 && c != '\n'
1216 && tok->level == 0
1217 /* There was a NEWLINE after ASYNC DEF,
1218 so we're past the signature. */
1219 && tok->async_def_nl
1220 /* Current indentation level is less than where
1221 the async function was defined */
1222 && tok->async_def_indent >= tok->indent)
1223 {
1224 tok->async_def = 0;
1225 tok->async_def_indent = 0;
1226 tok->async_def_nl = 0;
1227 }
1228
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 tok->start = NULL;
1231 /* Skip spaces */
1232 do {
1233 c = tok_nextc(tok);
1234 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001235
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001236 /* Set start of current token */
1237 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001238
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001239 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001240 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001241 const char *prefix, *p, *type_start;
1242
Brett Cannona721aba2016-09-09 14:57:09 -07001243 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001245 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001246
1247 if (tok->type_comments) {
1248 p = tok->start;
1249 prefix = type_comment_prefix;
1250 while (*prefix && p < tok->cur) {
1251 if (*prefix == ' ') {
1252 while (*p == ' ' || *p == '\t') {
1253 p++;
1254 }
1255 } else if (*prefix == *p) {
1256 p++;
1257 } else {
1258 break;
1259 }
1260
1261 prefix++;
1262 }
1263
1264 /* This is a type comment if we matched all of type_comment_prefix. */
1265 if (!*prefix) {
1266 int is_type_ignore = 1;
1267 tok_backup(tok, c); /* don't eat the newline or EOF */
1268
1269 type_start = p;
1270
1271 is_type_ignore = tok->cur >= p + 6 && memcmp(p, "ignore", 6) == 0;
1272 p += 6;
1273 while (is_type_ignore && p < tok->cur) {
1274 if (*p == '#')
1275 break;
1276 is_type_ignore = is_type_ignore && (*p == ' ' || *p == '\t');
1277 p++;
1278 }
1279
1280 if (is_type_ignore) {
1281 /* If this type ignore is the only thing on the line, consume the newline also. */
1282 if (blankline) {
1283 tok_nextc(tok);
1284 tok->atbol = 1;
1285 }
1286 return TYPE_IGNORE;
1287 } else {
1288 *p_start = (char *) type_start; /* after type_comment_prefix */
1289 *p_end = tok->cur;
1290 return TYPE_COMMENT;
1291 }
1292 }
1293 }
Brett Cannona721aba2016-09-09 14:57:09 -07001294 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001295
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 /* Check for EOF and errors now */
1297 if (c == EOF) {
1298 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1299 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001300
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301 /* Identifier (most frequent token!) */
1302 nonascii = 0;
1303 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001304 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001305 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001306 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001307 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001308 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001309 /* Since this is a backwards compatibility support literal we don't
1310 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001311 else if (!(saw_b || saw_u || saw_r || saw_f)
1312 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001313 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001314 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001315 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001316 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001317 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001318 }
1319 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001320 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001321 }
1322 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001323 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001324 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001326 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001327 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001328 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 }
1330 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001331 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001333 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 c = tok_nextc(tok);
1335 }
1336 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001337 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001339 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 *p_start = tok->start;
1341 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001342
Guido van Rossum495da292019-03-07 12:38:08 -08001343 /* async/await parsing block. */
1344 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1345 /* May be an 'async' or 'await' token. For Python 3.7 or
1346 later we recognize them unconditionally. For Python
1347 3.5 or 3.6 we recognize 'async' in front of 'def', and
1348 either one inside of 'async def'. (Technically we
1349 shouldn't recognize these at all for 3.4 or earlier,
1350 but there's no *valid* Python 3.4 code that would be
1351 rejected, and async functions will be rejected in a
1352 later phase.) */
1353 if (!tok->async_hacks || tok->async_def) {
1354 /* Always recognize the keywords. */
1355 if (memcmp(tok->start, "async", 5) == 0) {
1356 return ASYNC;
1357 }
1358 if (memcmp(tok->start, "await", 5) == 0) {
1359 return AWAIT;
1360 }
1361 }
1362 else if (memcmp(tok->start, "async", 5) == 0) {
1363 /* The current token is 'async'.
1364 Look ahead one token to see if that is 'def'. */
1365
1366 struct tok_state ahead_tok;
1367 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1368 int ahead_tok_kind;
1369
1370 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1371 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1372 &ahead_tok_end);
1373
1374 if (ahead_tok_kind == NAME
1375 && ahead_tok.cur - ahead_tok.start == 3
1376 && memcmp(ahead_tok.start, "def", 3) == 0)
1377 {
1378 /* The next token is going to be 'def', so instead of
1379 returning a plain NAME token, return ASYNC. */
1380 tok->async_def_indent = tok->indent;
1381 tok->async_def = 1;
1382 return ASYNC;
1383 }
1384 }
1385 }
1386
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 return NAME;
1388 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 /* Newline */
1391 if (c == '\n') {
1392 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001393 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001395 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 *p_start = tok->start;
1397 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1398 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001399 if (tok->async_def) {
1400 /* We're somewhere inside an 'async def' function, and
1401 we've encountered a NEWLINE after its signature. */
1402 tok->async_def_nl = 1;
1403 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 return NEWLINE;
1405 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 /* Period or number starting with period? */
1408 if (c == '.') {
1409 c = tok_nextc(tok);
1410 if (isdigit(c)) {
1411 goto fraction;
1412 } else if (c == '.') {
1413 c = tok_nextc(tok);
1414 if (c == '.') {
1415 *p_start = tok->start;
1416 *p_end = tok->cur;
1417 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001418 }
1419 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001420 tok_backup(tok, c);
1421 }
1422 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001423 }
1424 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 tok_backup(tok, c);
1426 }
1427 *p_start = tok->start;
1428 *p_end = tok->cur;
1429 return DOT;
1430 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001431
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 /* Number */
1433 if (isdigit(c)) {
1434 if (c == '0') {
1435 /* Hex, octal or binary -- maybe. */
1436 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 /* Hex */
1439 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001441 if (c == '_') {
1442 c = tok_nextc(tok);
1443 }
1444 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001445 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001446 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001447 }
1448 do {
1449 c = tok_nextc(tok);
1450 } while (isxdigit(c));
1451 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 }
1453 else if (c == 'o' || c == 'O') {
1454 /* Octal */
1455 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001457 if (c == '_') {
1458 c = tok_nextc(tok);
1459 }
1460 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001461 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001462 if (isdigit(c)) {
1463 return syntaxerror(tok,
1464 "invalid digit '%c' in octal literal", c);
1465 }
1466 else {
1467 return syntaxerror(tok, "invalid octal literal");
1468 }
Brett Cannona721aba2016-09-09 14:57:09 -07001469 }
1470 do {
1471 c = tok_nextc(tok);
1472 } while ('0' <= c && c < '8');
1473 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001474 if (isdigit(c)) {
1475 return syntaxerror(tok,
1476 "invalid digit '%c' in octal literal", c);
1477 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 }
1479 else if (c == 'b' || c == 'B') {
1480 /* Binary */
1481 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001483 if (c == '_') {
1484 c = tok_nextc(tok);
1485 }
1486 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001487 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001488 if (isdigit(c)) {
1489 return syntaxerror(tok,
1490 "invalid digit '%c' in binary literal", c);
1491 }
1492 else {
1493 return syntaxerror(tok, "invalid binary literal");
1494 }
Brett Cannona721aba2016-09-09 14:57:09 -07001495 }
1496 do {
1497 c = tok_nextc(tok);
1498 } while (c == '0' || c == '1');
1499 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001500 if (isdigit(c)) {
1501 return syntaxerror(tok,
1502 "invalid digit '%c' in binary literal", c);
1503 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 }
1505 else {
1506 int nonzero = 0;
1507 /* maybe old-style octal; c is first char of it */
1508 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001509 while (1) {
1510 if (c == '_') {
1511 c = tok_nextc(tok);
1512 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001513 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001514 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001515 }
1516 }
1517 if (c != '0') {
1518 break;
1519 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 c = tok_nextc(tok);
1521 }
Brett Cannona721aba2016-09-09 14:57:09 -07001522 if (isdigit(c)) {
1523 nonzero = 1;
1524 c = tok_decimal_tail(tok);
1525 if (c == 0) {
1526 return ERRORTOKEN;
1527 }
1528 }
1529 if (c == '.') {
1530 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001532 }
1533 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001535 }
1536 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001538 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001540 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001541 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001542 return syntaxerror(tok,
1543 "leading zeros in decimal integer "
1544 "literals are not permitted; "
1545 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546 }
1547 }
1548 }
1549 else {
1550 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001551 c = tok_decimal_tail(tok);
1552 if (c == 0) {
1553 return ERRORTOKEN;
1554 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 {
1556 /* Accept floating point numbers. */
1557 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001558 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 fraction:
1560 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001561 if (isdigit(c)) {
1562 c = tok_decimal_tail(tok);
1563 if (c == 0) {
1564 return ERRORTOKEN;
1565 }
1566 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 }
1568 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001569 int e;
1570 exponent:
1571 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001572 /* Exponent part */
1573 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001574 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001576 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001577 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001578 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001579 }
1580 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001581 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001582 tok_backup(tok, e);
1583 *p_start = tok->start;
1584 *p_end = tok->cur;
1585 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001586 }
Brett Cannona721aba2016-09-09 14:57:09 -07001587 c = tok_decimal_tail(tok);
1588 if (c == 0) {
1589 return ERRORTOKEN;
1590 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 }
Brett Cannona721aba2016-09-09 14:57:09 -07001592 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 /* Imaginary part */
1594 imaginary:
1595 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001596 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001597 }
1598 }
1599 tok_backup(tok, c);
1600 *p_start = tok->start;
1601 *p_end = tok->cur;
1602 return NUMBER;
1603 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001604
1605 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606 /* String */
1607 if (c == '\'' || c == '"') {
1608 int quote = c;
1609 int quote_size = 1; /* 1 or 3 */
1610 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001611
Anthony Sottile995d9b92019-01-12 20:05:13 -08001612 /* Nodes of type STRING, especially multi line strings
1613 must be handled differently in order to get both
1614 the starting line number and the column offset right.
1615 (cf. issue 16806) */
1616 tok->first_lineno = tok->lineno;
1617 tok->multi_line_start = tok->line_start;
1618
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001619 /* Find the quote size and start of string */
1620 c = tok_nextc(tok);
1621 if (c == quote) {
1622 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001623 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001625 }
1626 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001628 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 }
Brett Cannona721aba2016-09-09 14:57:09 -07001630 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001632 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001633
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 /* Get rest of string */
1635 while (end_quote_size != quote_size) {
1636 c = tok_nextc(tok);
1637 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001638 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001639 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001640 }
1641 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001643 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 tok->cur = tok->inp;
1645 return ERRORTOKEN;
1646 }
1647 if (quote_size == 1 && c == '\n') {
1648 tok->done = E_EOLS;
1649 tok->cur = tok->inp;
1650 return ERRORTOKEN;
1651 }
Brett Cannona721aba2016-09-09 14:57:09 -07001652 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001654 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 else {
1656 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001657 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001658 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001659 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 }
1661 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001662
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 *p_start = tok->start;
1664 *p_end = tok->cur;
1665 return STRING;
1666 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001667
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 /* Line continuation */
1669 if (c == '\\') {
1670 c = tok_nextc(tok);
1671 if (c != '\n') {
1672 tok->done = E_LINECONT;
1673 tok->cur = tok->inp;
1674 return ERRORTOKEN;
1675 }
1676 tok->cont_line = 1;
1677 goto again; /* Read next line */
1678 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001679
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680 /* Check for two-character token */
1681 {
1682 int c2 = tok_nextc(tok);
1683 int token = PyToken_TwoChars(c, c2);
1684 if (token != OP) {
1685 int c3 = tok_nextc(tok);
1686 int token3 = PyToken_ThreeChars(c, c2, c3);
1687 if (token3 != OP) {
1688 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001689 }
1690 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691 tok_backup(tok, c3);
1692 }
1693 *p_start = tok->start;
1694 *p_end = tok->cur;
1695 return token;
1696 }
1697 tok_backup(tok, c2);
1698 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001699
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 /* Keep track of parentheses nesting level */
1701 switch (c) {
1702 case '(':
1703 case '[':
1704 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001705 if (tok->level >= MAXLEVEL) {
1706 return syntaxerror(tok, "too many nested parentheses");
1707 }
1708 tok->parenstack[tok->level] = c;
1709 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 tok->level++;
1711 break;
1712 case ')':
1713 case ']':
1714 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001715 if (!tok->level) {
1716 return syntaxerror(tok, "unmatched '%c'", c);
1717 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001719 int opening = tok->parenstack[tok->level];
1720 if (!((opening == '(' && c == ')') ||
1721 (opening == '[' && c == ']') ||
1722 (opening == '{' && c == '}')))
1723 {
1724 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1725 return syntaxerror(tok,
1726 "closing parenthesis '%c' does not match "
1727 "opening parenthesis '%c' on line %d",
1728 c, opening, tok->parenlinenostack[tok->level]);
1729 }
1730 else {
1731 return syntaxerror(tok,
1732 "closing parenthesis '%c' does not match "
1733 "opening parenthesis '%c'",
1734 c, opening);
1735 }
1736 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 break;
1738 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001739
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 /* Punctuation character */
1741 *p_start = tok->start;
1742 *p_end = tok->cur;
1743 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001744}
1745
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001746int
1747PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1748{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 int result = tok_get(tok, p_start, p_end);
1750 if (tok->decoding_erred) {
1751 result = ERRORTOKEN;
1752 tok->done = E_DECODE;
1753 }
1754 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001755}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001756
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001757/* Get the encoding of a Python file. Check for the coding cookie and check if
1758 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001759
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001760 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1761 encoding in the first or second line of the file (in which case the encoding
1762 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001763
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001764 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1765 by the caller. */
1766
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001767char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001768PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001769{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001770 struct tok_state *tok;
1771 FILE *fp;
1772 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001773
Victor Stinnerdaf45552013-08-28 00:53:59 +02001774 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001775 if (fd < 0) {
1776 return NULL;
1777 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001778
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 fp = fdopen(fd, "r");
1780 if (fp == NULL) {
1781 return NULL;
1782 }
1783 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1784 if (tok == NULL) {
1785 fclose(fp);
1786 return NULL;
1787 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001788 if (filename != NULL) {
1789 Py_INCREF(filename);
1790 tok->filename = filename;
1791 }
1792 else {
1793 tok->filename = PyUnicode_FromString("<string>");
1794 if (tok->filename == NULL) {
1795 fclose(fp);
1796 PyTokenizer_Free(tok);
1797 return encoding;
1798 }
1799 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001800 while (tok->lineno < 2 && tok->done == E_OK) {
1801 PyTokenizer_Get(tok, &p_start, &p_end);
1802 }
1803 fclose(fp);
1804 if (tok->encoding) {
1805 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1806 if (encoding)
1807 strcpy(encoding, tok->encoding);
1808 }
1809 PyTokenizer_Free(tok);
1810 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001811}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001812
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001813char *
1814PyTokenizer_FindEncoding(int fd)
1815{
1816 return PyTokenizer_FindEncodingFilename(fd, NULL);
1817}
1818
Guido van Rossum408027e1996-12-30 16:17:54 +00001819#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001820
1821void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001822tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001823{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001824 printf("%s", _PyParser_TokenNames[type]);
1825 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1826 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001827}
1828
1829#endif