blob: 62cd2966231b8a30ba864bbad759f3cc2b17d7b2 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
/* Alternate tab spacing: a second tab width kept next to TABSIZE.
   (Its users are outside this part of the file.) */
#define ALTTABSIZE  1
21
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore, or
   any value >= 128 (non-ASCII is accepted permissively here).
   The argument is parenthesized at every use so that an arbitrary expression
   can be passed, but it is still evaluated several times: do not pass an
   expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000027
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128 (non-ASCII is accepted permissively here).
   The argument is parenthesized at every use so that an arbitrary expression
   can be passed, but it is still evaluated several times: do not pass an
   expression with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE  8

/* Forward declarations */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);


/* Prefix that introduces a type comment.  Spaces in this constant are
   treated as "zero or more spaces or tabs" when tokenizing. */
static const char *type_comment_prefix = "# type: ";
48
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000049/* Create and initialize a new tok_state structure */
50
51static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000052tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053{
Victor Stinner00d7abd2020-12-01 09:56:42 +010054 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000055 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060058 tok->buf = tok->cur = tok->inp = NULL;
59 tok->start = NULL;
60 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000061 tok->done = E_OK;
62 tok->fp = NULL;
63 tok->input = NULL;
64 tok->tabsize = TABSIZE;
65 tok->indent = 0;
66 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040067
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000068 tok->atbol = 1;
69 tok->pendin = 0;
70 tok->prompt = tok->nextprompt = NULL;
71 tok->lineno = 0;
72 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000073 tok->altindstack[0] = 0;
74 tok->decoding_state = STATE_INIT;
75 tok->decoding_erred = 0;
76 tok->read_coding_spec = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +020084 tok->stdin_content = NULL;
Yury Selivanov96ec9342015-07-23 15:01:58 +030085
Guido van Rossum495da292019-03-07 12:38:08 -080086 tok->async_hacks = 0;
87 tok->async_def = 0;
88 tok->async_def_indent = 0;
89 tok->async_def_nl = 0;
90
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000092}
93
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000094static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070095new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000096{
Victor Stinner00d7abd2020-12-01 09:56:42 +010097 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098 if (!result) {
99 tok->done = E_NOMEM;
100 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700102 memcpy(result, s, len);
103 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000105}
106
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000107static char *
108error_ret(struct tok_state *tok) /* XXX */
109{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000110 tok->decoding_erred = 1;
111 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100112 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600113 tok->buf = tok->cur = tok->inp = NULL;
114 tok->start = NULL;
115 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200116 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000117 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000118}
119
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000120
/* Map the common spellings of UTF-8 and Latin-1 to one canonical name.

   Only the first 12 characters of S are examined, after lowercasing and
   turning '_' into '-'.  Returns the literal "utf-8" or "iso-8859-1" when S
   names one of those encodings (optionally followed by "-<suffix>"), and
   otherwise returns S itself, so callers can detect "unchanged" by comparing
   pointers. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative plain-char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
149
150/* Return the coding spec in S, or NULL if none is found. */
151
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700152static int
153get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000154{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700156 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000157 /* Coding spec must be in a comment, and that comment must be
158 * the only statement on the source code line. */
159 for (i = 0; i < size - 6; i++) {
160 if (s[i] == '#')
161 break;
162 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700163 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000164 }
165 for (; i < size - 6; i++) { /* XXX inefficient search */
166 const char* t = s + i;
167 if (strncmp(t, "coding", 6) == 0) {
168 const char* begin = NULL;
169 t += 6;
170 if (t[0] != ':' && t[0] != '=')
171 continue;
172 do {
173 t++;
174 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 begin = t;
177 while (Py_ISALNUM(t[0]) ||
178 t[0] == '-' || t[0] == '_' || t[0] == '.')
179 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000181 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700182 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200183 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700184 if (!r)
185 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700186 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100188 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700189 r = new_string(q, strlen(q), tok);
190 if (!r)
191 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700193 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200194 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
196 }
197 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700198 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000199}
200
201/* Check whether the line contains a coding spec. If it does,
202 invoke the set_readline function for the new encoding.
203 This function receives the tok_state and the new encoding.
204 Return 1 on success, 0 on failure. */
205
206static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000207check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000208 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000209{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700210 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000212
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200213 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200215 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000216 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200217 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700218 if (!get_coding_spec(line, &cs, size, tok))
219 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200220 if (!cs) {
221 Py_ssize_t i;
222 for (i = 0; i < size; i++) {
223 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
224 break;
225 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
226 /* Stop checking coding spec after a line containing
227 * anything except a comment. */
228 tok->read_coding_spec = 1;
229 break;
230 }
231 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700232 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200233 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700234 tok->read_coding_spec = 1;
235 if (tok->encoding == NULL) {
236 assert(tok->decoding_state == STATE_RAW);
237 if (strcmp(cs, "utf-8") == 0) {
238 tok->encoding = cs;
239 } else {
240 r = set_readline(tok, cs);
241 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700243 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300246 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 "encoding problem: %s", cs);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100248 PyMem_Free(cs);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700249 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700251 } else { /* then, compare cs with BOM */
252 r = (strcmp(tok->encoding, cs) == 0);
253 if (!r)
254 PyErr_Format(PyExc_SyntaxError,
255 "encoding problem: %s with BOM", cs);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100256 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259}
260
261/* See whether the file starts with a BOM. If it does,
262 invoke the set_readline function with the new encoding.
263 Return 1 on success, 0 on failure. */
264
265static int
266check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 void unget_char(int, struct tok_state *),
268 int set_readline(struct tok_state *, const char *),
269 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000270{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 int ch1, ch2, ch3;
272 ch1 = get_char(tok);
273 tok->decoding_state = STATE_RAW;
274 if (ch1 == EOF) {
275 return 1;
276 } else if (ch1 == 0xEF) {
277 ch2 = get_char(tok);
278 if (ch2 != 0xBB) {
279 unget_char(ch2, tok);
280 unget_char(ch1, tok);
281 return 1;
282 }
283 ch3 = get_char(tok);
284 if (ch3 != 0xBF) {
285 unget_char(ch3, tok);
286 unget_char(ch2, tok);
287 unget_char(ch1, tok);
288 return 1;
289 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000291 /* Disable support for UTF-16 BOMs until a decision
292 is made whether this needs to be supported. */
293 } else if (ch1 == 0xFE) {
294 ch2 = get_char(tok);
295 if (ch2 != 0xFF) {
296 unget_char(ch2, tok);
297 unget_char(ch1, tok);
298 return 1;
299 }
300 if (!set_readline(tok, "utf-16-be"))
301 return 0;
302 tok->decoding_state = STATE_NORMAL;
303 } else if (ch1 == 0xFF) {
304 ch2 = get_char(tok);
305 if (ch2 != 0xFE) {
306 unget_char(ch2, tok);
307 unget_char(ch1, tok);
308 return 1;
309 }
310 if (!set_readline(tok, "utf-16-le"))
311 return 0;
312 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000313#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 } else {
315 unget_char(ch1, tok);
316 return 1;
317 }
318 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100319 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700320 tok->encoding = new_string("utf-8", 5, tok);
321 if (!tok->encoding)
322 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 /* No need to set_readline: input is already utf-8 */
324 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000325}
326
327/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000328 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000329
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000330 On entry, tok->decoding_buffer will be one of:
331 1) NULL: need to call tok->decoding_readline to get a new line
332 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000333 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000334 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000335 (in the s buffer) to copy entire contents of the line read
336 by tok->decoding_readline. tok->decoding_buffer has the overflow.
337 In this case, fp_readl is called in a loop (with an expanded buffer)
338 until the buffer ends with a '\n' (or until the end of the file is
339 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000340*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341
342static char *
343fp_readl(char *s, int size, struct tok_state *tok)
344{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000345 PyObject* bufobj;
346 const char *buf;
347 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000348
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000349 /* Ask for one less byte so we can terminate it */
350 assert(size > 0);
351 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000353 if (tok->decoding_buffer) {
354 bufobj = tok->decoding_buffer;
355 Py_INCREF(bufobj);
356 }
357 else
358 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100359 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 if (bufobj == NULL)
361 goto error;
362 }
363 if (PyUnicode_CheckExact(bufobj))
364 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200365 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 if (buf == NULL) {
367 goto error;
368 }
369 }
370 else
371 {
372 buf = PyByteArray_AsString(bufobj);
373 if (buf == NULL) {
374 goto error;
375 }
376 buflen = PyByteArray_GET_SIZE(bufobj);
377 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 Py_XDECREF(tok->decoding_buffer);
380 if (buflen > size) {
381 /* Too many chars, the rest goes into tok->decoding_buffer */
382 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
383 buflen-size);
384 if (tok->decoding_buffer == NULL)
385 goto error;
386 buflen = size;
387 }
388 else
389 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000390
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000391 memcpy(s, buf, buflen);
392 s[buflen] = '\0';
393 if (buflen == 0) /* EOF */
394 s = NULL;
395 Py_DECREF(bufobj);
396 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000397
398error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399 Py_XDECREF(bufobj);
400 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000401}
402
403/* Set the readline function for TOK to a StreamReader's
404 readline function. The StreamReader is named ENC.
405
406 This function is called from check_bom and check_coding_spec.
407
408 ENC is usually identical to the future value of tok->encoding,
409 except for the (currently unsupported) case of UTF-16.
410
411 Return 1 on success, 0 on failure. */
412
413static int
414fp_setreadl(struct tok_state *tok, const char* enc)
415{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700416 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200417 _Py_IDENTIFIER(open);
418 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000419 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200420 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000421
Victor Stinner22a351a2010-10-14 12:04:34 +0000422 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200423 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100424 * position of tok->fp. If tok->fp was opened in text mode on Windows,
425 * its file position counts CRLF as one char and can't be directly mapped
426 * to the file offset for fd. Instead we step back one byte and read to
427 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200428 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100429 if (pos == -1 ||
430 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000431 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700432 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000433 }
434
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700435 io = PyImport_ImportModuleNoBlock("io");
436 if (io == NULL)
437 return 0;
438
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200439 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000440 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700441 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000442 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700443 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200445 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700446 Py_DECREF(stream);
447 if (readline == NULL)
448 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300449 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700450
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100451 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100452 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700453 if (bufobj == NULL)
454 return 0;
455 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100456 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000457
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700458 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459}
460
461/* Fetch the next byte from TOK. */
462
463static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465}
466
467/* Unfetch the last byte back into TOK. */
468
469static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471}
472
/* Check whether the bytes at S start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes, 0 if not.

   Only the lead byte and the 10xxxxxx shape of the continuation bytes are
   checked; overlong forms and out-of-range code points are not rejected.

   S must be NUL-terminated.  Continuation bytes are examined front to back
   so the scan stops at the first bad byte -- in particular at the
   terminating NUL of a truncated sequence -- and never reads beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;

    for (i = 1; i <= trailing; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return trailing + 1;
}
500
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000501/* Read a line of input from TOK. Determine encoding
502 if necessary. */
503
504static char *
505decoding_fgets(char *s, int size, struct tok_state *tok)
506{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507 char *line = NULL;
508 int badchar = 0;
509 for (;;) {
510 if (tok->decoding_state == STATE_NORMAL) {
511 /* We already have a codec associated with
512 this input. */
513 line = fp_readl(s, size, tok);
514 break;
515 } else if (tok->decoding_state == STATE_RAW) {
516 /* We want a 'raw' read. */
517 line = Py_UniversalNewlineFgets(s, size,
518 tok->fp, NULL);
519 break;
520 } else {
521 /* We have not yet determined the encoding.
522 If an encoding is found, use the file-pointer
523 reader functions from now on. */
524 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
525 return error_ret(tok);
526 assert(tok->decoding_state != STATE_INIT);
527 }
528 }
529 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
530 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
531 return error_ret(tok);
532 }
533 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000534 /* The default encoding is UTF-8, so make sure we don't have any
535 non-UTF-8 sequences in it. */
536 if (line && !tok->encoding) {
537 unsigned char *c;
538 int length;
539 for (c = (unsigned char *)line; *c; c += length)
540 if (!(length = valid_utf8(c))) {
541 badchar = *c;
542 break;
543 }
544 }
545 if (badchar) {
546 /* Need to add 1 to the line number, since this line
547 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200548 PyErr_Format(PyExc_SyntaxError,
549 "Non-UTF-8 code starting with '\\x%.2x' "
550 "in file %U on line %i, "
551 "but no encoding declared; "
552 "see http://python.org/dev/peps/pep-0263/ for details",
553 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000554 return error_ret(tok);
555 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000556 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557}
558
559static int
560decoding_feof(struct tok_state *tok)
561{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000562 if (tok->decoding_state != STATE_NORMAL) {
563 return feof(tok->fp);
564 } else {
565 PyObject* buf = tok->decoding_buffer;
566 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100567 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000568 if (buf == NULL) {
569 error_ret(tok);
570 return 1;
571 } else {
572 tok->decoding_buffer = buf;
573 }
574 }
575 return PyObject_Length(buf) == 0;
576 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577}
578
579/* Fetch a byte from TOK, using the string buffer. */
580
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581static int
582buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000583 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584}
585
586/* Unfetch a byte from TOK, using the string buffer. */
587
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588static void
589buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000590 tok->str--;
591 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592}
593
594/* Set the readline function for TOK to ENC. For the string-based
595 tokenizer, this means to just record the encoding. */
596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000597static int
598buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000599 tok->enc = enc;
600 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Return a UTF-8 encoding Python string object from the
604 C byte string STR, which is encoded with ENC. */
605
606static PyObject *
607translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 PyObject *utf8;
609 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
610 if (buf == NULL)
611 return NULL;
612 utf8 = PyUnicode_AsUTF8String(buf);
613 Py_DECREF(buf);
614 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615}
616
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000617
618static char *
619translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200620 int skip_next_lf = 0;
621 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000622 char *buf, *current;
623 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100624 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 if (buf == NULL) {
626 tok->done = E_NOMEM;
627 return NULL;
628 }
629 for (current = buf; *s; s++, current++) {
630 c = *s;
631 if (skip_next_lf) {
632 skip_next_lf = 0;
633 if (c == '\n') {
634 c = *++s;
635 if (!c)
636 break;
637 }
638 }
639 if (c == '\r') {
640 skip_next_lf = 1;
641 c = '\n';
642 }
643 *current = c;
644 }
645 /* If this is exec input, add a newline to the end of the string if
646 there isn't one already. */
647 if (exec_input && c != '\n') {
648 *current = '\n';
649 current++;
650 }
651 *current = '\0';
652 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000653 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100655 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000656 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100657 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000658 }
659 buf = result;
660 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000662}
663
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664/* Decode a byte string STR for use as the buffer of TOK.
665 Look for encoding declarations inside STR, and record them
666 inside TOK. */
667
Andy Lester384f3c52020-02-27 20:44:52 -0600668static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000669decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000671 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600672 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 const char *s;
674 const char *newl[2] = {NULL, NULL};
675 int lineno = 0;
676 tok->input = str = translate_newlines(input, single, tok);
677 if (str == NULL)
678 return NULL;
679 tok->enc = NULL;
680 tok->str = str;
681 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
682 return error_ret(tok);
683 str = tok->str; /* string after BOM if any */
684 assert(str);
685 if (tok->enc != NULL) {
686 utf8 = translate_into_utf8(str, tok->enc);
687 if (utf8 == NULL)
688 return error_ret(tok);
689 str = PyBytes_AsString(utf8);
690 }
691 for (s = str;; s++) {
692 if (*s == '\0') break;
693 else if (*s == '\n') {
694 assert(lineno < 2);
695 newl[lineno] = s;
696 lineno++;
697 if (lineno == 2) break;
698 }
699 }
700 tok->enc = NULL;
701 /* need to check line 1 and 2 separately since check_coding_spec
702 assumes a single line as input */
703 if (newl[0]) {
704 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
705 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200706 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000707 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
708 tok, buf_setreadl))
709 return error_ret(tok);
710 }
711 }
712 if (tok->enc != NULL) {
713 assert(utf8 == NULL);
714 utf8 = translate_into_utf8(str, tok->enc);
715 if (utf8 == NULL)
716 return error_ret(tok);
717 str = PyBytes_AS_STRING(utf8);
718 }
719 assert(tok->decoding_buffer == NULL);
720 tok->decoding_buffer = utf8; /* CAUTION */
721 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000722}
723
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724/* Set up tokenizer for string */
725
726struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000727PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000728{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600730 char *decoded;
731
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (tok == NULL)
733 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600734 decoded = decode_str(str, exec_input, tok);
735 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 PyTokenizer_Free(tok);
737 return NULL;
738 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000739
Andy Lester384f3c52020-02-27 20:44:52 -0600740 tok->buf = tok->cur = tok->inp = decoded;
741 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743}
744
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000745struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000746PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000747{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600749 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 if (tok == NULL)
751 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600752 tok->input = translated = translate_newlines(str, exec_input, tok);
753 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 PyTokenizer_Free(tok);
755 return NULL;
756 }
757 tok->decoding_state = STATE_RAW;
758 tok->read_coding_spec = 1;
759 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600760 tok->str = translated;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100761 tok->encoding = (char *)PyMem_Malloc(6);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 if (!tok->encoding) {
763 PyTokenizer_Free(tok);
764 return NULL;
765 }
766 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000767
Andy Lester384f3c52020-02-27 20:44:52 -0600768 tok->buf = tok->cur = tok->inp = translated;
769 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000770 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000771}
772
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000773/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774
775struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300776PyTokenizer_FromFile(FILE *fp, const char* enc,
777 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 struct tok_state *tok = tok_new();
780 if (tok == NULL)
781 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100782 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 PyTokenizer_Free(tok);
784 return NULL;
785 }
786 tok->cur = tok->inp = tok->buf;
787 tok->end = tok->buf + BUFSIZ;
788 tok->fp = fp;
789 tok->prompt = ps1;
790 tok->nextprompt = ps2;
791 if (enc != NULL) {
792 /* Must copy encoding declaration since it
793 gets copied into the parse tree. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100794 tok->encoding = PyMem_Malloc(strlen(enc)+1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 if (!tok->encoding) {
796 PyTokenizer_Free(tok);
797 return NULL;
798 }
799 strcpy(tok->encoding, enc);
800 tok->decoding_state = STATE_NORMAL;
801 }
802 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803}
804
805
806/* Free a tok_state structure */
807
808void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000809PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100812 PyMem_Free(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 Py_XDECREF(tok->decoding_readline);
814 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200815 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 if (tok->fp != NULL && tok->buf != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100817 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 if (tok->input)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100819 PyMem_Free(tok->input);
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +0200820 if (tok->stdin_content)
821 PyMem_Free(tok->stdin_content);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100822 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823}
824
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000825/* Get next char, updating state; error code goes into tok->done */
826
827static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200828tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 for (;;) {
831 if (tok->cur != tok->inp) {
832 return Py_CHARMASK(*tok->cur++); /* Fast path */
833 }
834 if (tok->done != E_OK)
835 return EOF;
836 if (tok->fp == NULL) {
837 char *end = strchr(tok->inp, '\n');
838 if (end != NULL)
839 end++;
840 else {
841 end = strchr(tok->inp, '\0');
842 if (end == tok->inp) {
843 tok->done = E_EOF;
844 return EOF;
845 }
846 }
847 if (tok->start == NULL)
848 tok->buf = tok->cur;
849 tok->line_start = tok->cur;
850 tok->lineno++;
851 tok->inp = end;
852 return Py_CHARMASK(*tok->cur++);
853 }
854 if (tok->prompt != NULL) {
855 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000856 if (newtok != NULL) {
857 char *translated = translate_newlines(newtok, 0, tok);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100858 PyMem_Free(newtok);
Victor Stinner89e34362011-01-07 18:47:22 +0000859 if (translated == NULL)
860 return EOF;
861 newtok = translated;
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +0200862 if (tok->stdin_content == NULL) {
863 tok->stdin_content = PyMem_Malloc(strlen(translated) + 1);
864 if (tok->stdin_content == NULL) {
865 tok->done = E_NOMEM;
866 return EOF;
867 }
868 sprintf(tok->stdin_content, "%s", translated);
869 }
870 else {
871 char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1);
872 if (new_str == NULL) {
873 tok->done = E_NOMEM;
874 return EOF;
875 }
876 sprintf(new_str, "%s%s", tok->stdin_content, translated);
877 PyMem_Free(tok->stdin_content);
878 tok->stdin_content = new_str;
879 }
Victor Stinner89e34362011-01-07 18:47:22 +0000880 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000881 if (tok->encoding && newtok && *newtok) {
882 /* Recode to UTF-8 */
883 Py_ssize_t buflen;
884 const char* buf;
885 PyObject *u = translate_into_utf8(newtok, tok->encoding);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100886 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887 if (!u) {
888 tok->done = E_DECODE;
889 return EOF;
890 }
891 buflen = PyBytes_GET_SIZE(u);
892 buf = PyBytes_AS_STRING(u);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100893 newtok = PyMem_Malloc(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700894 if (newtok == NULL) {
895 Py_DECREF(u);
896 tok->done = E_NOMEM;
897 return EOF;
898 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 strcpy(newtok, buf);
900 Py_DECREF(u);
901 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 if (tok->nextprompt != NULL)
903 tok->prompt = tok->nextprompt;
904 if (newtok == NULL)
905 tok->done = E_INTR;
906 else if (*newtok == '\0') {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100907 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908 tok->done = E_EOF;
909 }
910 else if (tok->start != NULL) {
911 size_t start = tok->start - tok->buf;
912 size_t oldlen = tok->cur - tok->buf;
913 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000914 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 char *buf = tok->buf;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100916 buf = (char *)PyMem_Realloc(buf, newlen+1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000917 tok->lineno++;
918 if (buf == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100919 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000920 tok->buf = NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100921 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000922 tok->done = E_NOMEM;
923 return EOF;
924 }
925 tok->buf = buf;
926 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000927 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000928 tok->line_start = tok->cur;
929 strcpy(tok->buf + oldlen, newtok);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100930 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931 tok->inp = tok->buf + newlen;
932 tok->end = tok->inp + 1;
933 tok->start = tok->buf + start;
934 }
935 else {
936 tok->lineno++;
937 if (tok->buf != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100938 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000940 tok->cur = tok->buf;
941 tok->line_start = tok->buf;
942 tok->inp = strchr(tok->buf, '\0');
943 tok->end = tok->inp + 1;
944 }
945 }
946 else {
947 int done = 0;
948 Py_ssize_t cur = 0;
949 char *pt;
950 if (tok->start == NULL) {
951 if (tok->buf == NULL) {
952 tok->buf = (char *)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100953 PyMem_Malloc(BUFSIZ);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000954 if (tok->buf == NULL) {
955 tok->done = E_NOMEM;
956 return EOF;
957 }
958 tok->end = tok->buf + BUFSIZ;
959 }
960 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
961 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200962 if (!tok->decoding_erred)
963 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 done = 1;
965 }
966 else {
967 tok->done = E_OK;
968 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700969 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 }
971 }
972 else {
973 cur = tok->cur - tok->buf;
974 if (decoding_feof(tok)) {
975 tok->done = E_EOF;
976 done = 1;
977 }
978 else
979 tok->done = E_OK;
980 }
981 tok->lineno++;
982 /* Read until '\n' or EOF */
983 while (!done) {
984 Py_ssize_t curstart = tok->start == NULL ? -1 :
985 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700986 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000987 Py_ssize_t curvalid = tok->inp - tok->buf;
988 Py_ssize_t newsize = curvalid + BUFSIZ;
989 char *newbuf = tok->buf;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100990 newbuf = (char *)PyMem_Realloc(newbuf,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 newsize);
992 if (newbuf == NULL) {
993 tok->done = E_NOMEM;
994 tok->cur = tok->inp;
995 return EOF;
996 }
997 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200998 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700999 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001000 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001001 tok->inp = tok->buf + curvalid;
1002 tok->end = tok->buf + newsize;
1003 tok->start = curstart < 0 ? NULL :
1004 tok->buf + curstart;
1005 if (decoding_fgets(tok->inp,
1006 (int)(tok->end - tok->inp),
1007 tok) == NULL) {
1008 /* Break out early on decoding
1009 errors, as tok->buf will be NULL
1010 */
1011 if (tok->decoding_erred)
1012 return EOF;
1013 /* Last line does not end in \n,
1014 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -07001015 if (tok->inp[-1] != '\n')
1016 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 }
1018 tok->inp = strchr(tok->inp, '\0');
1019 done = tok->inp[-1] == '\n';
1020 }
1021 if (tok->buf != NULL) {
1022 tok->cur = tok->buf + cur;
1023 tok->line_start = tok->cur;
1024 /* replace "\r\n" with "\n" */
1025 /* For Mac leave the \r, giving a syntax error */
1026 pt = tok->inp - 2;
1027 if (pt >= tok->buf && *pt == '\r') {
1028 *pt++ = '\n';
1029 *pt = '\0';
1030 tok->inp = pt;
1031 }
1032 }
1033 }
1034 if (tok->done != E_OK) {
1035 if (tok->prompt != NULL)
1036 PySys_WriteStderr("\n");
1037 tok->cur = tok->inp;
1038 return EOF;
1039 }
1040 }
1041 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001042}
1043
1044
1045/* Back-up one character */
1046
1047static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001048tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001049{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001051 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001052 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001053 }
1054 if (*tok->cur != c) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 *tok->cur = c;
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001056 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001058}
1059
1060
Guido van Rossum926f13a1998-04-09 21:38:06 +00001061static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001062syntaxerror(struct tok_state *tok, const char *format, ...)
1063{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001064 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001065 va_list vargs;
1066#ifdef HAVE_STDARG_PROTOTYPES
1067 va_start(vargs, format);
1068#else
1069 va_start(vargs);
1070#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001071 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001072 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001073 if (!errmsg) {
1074 goto error;
1075 }
1076
1077 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1078 "replace");
1079 if (!errtext) {
1080 goto error;
1081 }
1082 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1083 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1084 if (line_len != tok->cur - tok->line_start) {
1085 Py_DECREF(errtext);
1086 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1087 "replace");
1088 }
1089 if (!errtext) {
1090 goto error;
1091 }
1092
1093 args = Py_BuildValue("(O(OiiN))", errmsg,
1094 tok->filename, tok->lineno, offset, errtext);
1095 if (args) {
1096 PyErr_SetObject(PyExc_SyntaxError, args);
1097 Py_DECREF(args);
1098 }
1099
1100error:
1101 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001102 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001103 return ERRORTOKEN;
1104}
1105
1106static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001107indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001108{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001109 tok->done = E_TABSPACE;
1110 tok->cur = tok->inp;
1111 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112}
1113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114/* Verify that the identifier follows PEP 3131.
1115 All identifier strings are guaranteed to be "ready" unicode objects.
1116 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001117static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001118verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001119{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001121 if (tok->decoding_erred)
1122 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001124 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001126 tok->done = E_DECODE;
1127 }
1128 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 tok->done = E_ERROR;
1130 }
1131 return 0;
1132 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001133 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1134 if (invalid < 0) {
1135 Py_DECREF(s);
1136 tok->done = E_ERROR;
1137 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001138 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001139 assert(PyUnicode_GET_LENGTH(s) > 0);
1140 if (invalid < PyUnicode_GET_LENGTH(s)) {
1141 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1142 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1143 /* Determine the offset in UTF-8 encoded input */
1144 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1145 if (s != NULL) {
1146 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1147 }
1148 if (s == NULL) {
1149 tok->done = E_ERROR;
1150 return 0;
1151 }
1152 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1153 }
1154 Py_DECREF(s);
1155 // PyUnicode_FromFormatV() does not support %X
1156 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001157 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001158 if (Py_UNICODE_ISPRINTABLE(ch)) {
1159 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1160 }
1161 else {
1162 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1163 }
1164 return 0;
1165 }
1166 Py_DECREF(s);
1167 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001168}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001169
/* Consume the rest of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character after the run (already consumed, so the
   caller is expected to examine or back it up).  If an underscore is
   not followed by a digit, that character is pushed back, a
   SyntaxError ("invalid decimal literal") is raised and 0 is
   returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }

        /* An underscore must be followed by at least one digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1191
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192/* Get next token, after space stripping etc. */
1193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001194static int
Andy Lester384f3c52020-02-27 20:44:52 -06001195tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001196{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001197 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001199
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001201 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 tok->start = NULL;
1203 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001204
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 /* Get indentation level */
1206 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001207 int col = 0;
1208 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001209 tok->atbol = 0;
1210 for (;;) {
1211 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001212 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001213 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001214 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001216 col = (col / tok->tabsize + 1) * tok->tabsize;
1217 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001218 }
Brett Cannona721aba2016-09-09 14:57:09 -07001219 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001221 }
1222 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001224 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 }
1226 tok_backup(tok, c);
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001227 if (c == '#' || c == '\n' || c == '\\') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 /* Lines with only whitespace and/or comments
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001229 and/or a line continuation character
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 shouldn't affect the indentation and are
1231 not passed to the parser as NEWLINE tokens,
1232 except *totally* empty lines in interactive
1233 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001234 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001235 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001236 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001237 else if (tok->prompt != NULL && tok->lineno == 1) {
1238 /* In interactive mode, if the first line contains
1239 only spaces and/or a comment, let it through. */
1240 blankline = 0;
1241 col = altcol = 0;
1242 }
Brett Cannona721aba2016-09-09 14:57:09 -07001243 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001245 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001246 /* We can't jump back right here since we still
1247 may need to skip to the end of a comment */
1248 }
1249 if (!blankline && tok->level == 0) {
1250 if (col == tok->indstack[tok->indent]) {
1251 /* No change */
1252 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001253 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 }
1255 }
1256 else if (col > tok->indstack[tok->indent]) {
1257 /* Indent -- always one */
1258 if (tok->indent+1 >= MAXINDENT) {
1259 tok->done = E_TOODEEP;
1260 tok->cur = tok->inp;
1261 return ERRORTOKEN;
1262 }
1263 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001264 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 }
1266 tok->pendin++;
1267 tok->indstack[++tok->indent] = col;
1268 tok->altindstack[tok->indent] = altcol;
1269 }
1270 else /* col < tok->indstack[tok->indent] */ {
1271 /* Dedent -- any number, must be consistent */
1272 while (tok->indent > 0 &&
1273 col < tok->indstack[tok->indent]) {
1274 tok->pendin--;
1275 tok->indent--;
1276 }
1277 if (col != tok->indstack[tok->indent]) {
1278 tok->done = E_DEDENT;
1279 tok->cur = tok->inp;
1280 return ERRORTOKEN;
1281 }
1282 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001283 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 }
1285 }
1286 }
1287 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001288
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001290
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 /* Return pending indents/dedents */
1292 if (tok->pendin != 0) {
1293 if (tok->pendin < 0) {
1294 tok->pendin++;
1295 return DEDENT;
1296 }
1297 else {
1298 tok->pendin--;
1299 return INDENT;
1300 }
1301 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001302
Guido van Rossum495da292019-03-07 12:38:08 -08001303 /* Peek ahead at the next character */
1304 c = tok_nextc(tok);
1305 tok_backup(tok, c);
1306 /* Check if we are closing an async function */
1307 if (tok->async_def
1308 && !blankline
1309 /* Due to some implementation artifacts of type comments,
1310 * a TYPE_COMMENT at the start of a function won't set an
1311 * indentation level and it will produce a NEWLINE after it.
1312 * To avoid spuriously ending an async function due to this,
1313 * wait until we have some non-newline char in front of us. */
1314 && c != '\n'
1315 && tok->level == 0
1316 /* There was a NEWLINE after ASYNC DEF,
1317 so we're past the signature. */
1318 && tok->async_def_nl
1319 /* Current indentation level is less than where
1320 the async function was defined */
1321 && tok->async_def_indent >= tok->indent)
1322 {
1323 tok->async_def = 0;
1324 tok->async_def_indent = 0;
1325 tok->async_def_nl = 0;
1326 }
1327
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 tok->start = NULL;
1330 /* Skip spaces */
1331 do {
1332 c = tok_nextc(tok);
1333 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001334
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 /* Set start of current token */
1336 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001337
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001338 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001339 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001340 const char *prefix, *p, *type_start;
1341
Brett Cannona721aba2016-09-09 14:57:09 -07001342 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001344 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001345
1346 if (tok->type_comments) {
1347 p = tok->start;
1348 prefix = type_comment_prefix;
1349 while (*prefix && p < tok->cur) {
1350 if (*prefix == ' ') {
1351 while (*p == ' ' || *p == '\t') {
1352 p++;
1353 }
1354 } else if (*prefix == *p) {
1355 p++;
1356 } else {
1357 break;
1358 }
1359
1360 prefix++;
1361 }
1362
1363 /* This is a type comment if we matched all of type_comment_prefix. */
1364 if (!*prefix) {
1365 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001366 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001367 tok_backup(tok, c); /* don't eat the newline or EOF */
1368
1369 type_start = p;
1370
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001371 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001372 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001373 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001374 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001375 && !(tok->cur > ignore_end
1376 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001377
1378 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001379 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001380 *p_end = tok->cur;
1381
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001382 /* If this type ignore is the only thing on the line, consume the newline also. */
1383 if (blankline) {
1384 tok_nextc(tok);
1385 tok->atbol = 1;
1386 }
1387 return TYPE_IGNORE;
1388 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001389 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001390 *p_end = tok->cur;
1391 return TYPE_COMMENT;
1392 }
1393 }
1394 }
Brett Cannona721aba2016-09-09 14:57:09 -07001395 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 /* Check for EOF and errors now */
1398 if (c == EOF) {
1399 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1400 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 /* Identifier (most frequent token!) */
1403 nonascii = 0;
1404 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001405 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001406 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001407 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001408 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001409 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001410 /* Since this is a backwards compatibility support literal we don't
1411 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001412 else if (!(saw_b || saw_u || saw_r || saw_f)
1413 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001414 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001415 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001416 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001417 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001418 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001419 }
1420 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001421 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001422 }
1423 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001424 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001425 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001427 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001429 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 }
1431 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001432 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001434 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 c = tok_nextc(tok);
1436 }
1437 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001438 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001440 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001441
1442 *p_start = tok->start;
1443 *p_end = tok->cur;
1444
Guido van Rossum495da292019-03-07 12:38:08 -08001445 /* async/await parsing block. */
1446 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1447 /* May be an 'async' or 'await' token. For Python 3.7 or
1448 later we recognize them unconditionally. For Python
1449 3.5 or 3.6 we recognize 'async' in front of 'def', and
1450 either one inside of 'async def'. (Technically we
1451 shouldn't recognize these at all for 3.4 or earlier,
1452 but there's no *valid* Python 3.4 code that would be
1453 rejected, and async functions will be rejected in a
1454 later phase.) */
1455 if (!tok->async_hacks || tok->async_def) {
1456 /* Always recognize the keywords. */
1457 if (memcmp(tok->start, "async", 5) == 0) {
1458 return ASYNC;
1459 }
1460 if (memcmp(tok->start, "await", 5) == 0) {
1461 return AWAIT;
1462 }
1463 }
1464 else if (memcmp(tok->start, "async", 5) == 0) {
1465 /* The current token is 'async'.
1466 Look ahead one token to see if that is 'def'. */
1467
1468 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001469 const char *ahead_tok_start = NULL;
1470 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001471 int ahead_tok_kind;
1472
1473 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1474 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1475 &ahead_tok_end);
1476
1477 if (ahead_tok_kind == NAME
1478 && ahead_tok.cur - ahead_tok.start == 3
1479 && memcmp(ahead_tok.start, "def", 3) == 0)
1480 {
1481 /* The next token is going to be 'def', so instead of
1482 returning a plain NAME token, return ASYNC. */
1483 tok->async_def_indent = tok->indent;
1484 tok->async_def = 1;
1485 return ASYNC;
1486 }
1487 }
1488 }
1489
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 return NAME;
1491 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Newline */
1494 if (c == '\n') {
1495 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001496 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001498 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 *p_start = tok->start;
1500 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1501 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001502 if (tok->async_def) {
1503 /* We're somewhere inside an 'async def' function, and
1504 we've encountered a NEWLINE after its signature. */
1505 tok->async_def_nl = 1;
1506 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 return NEWLINE;
1508 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001509
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001510 /* Period or number starting with period? */
1511 if (c == '.') {
1512 c = tok_nextc(tok);
1513 if (isdigit(c)) {
1514 goto fraction;
1515 } else if (c == '.') {
1516 c = tok_nextc(tok);
1517 if (c == '.') {
1518 *p_start = tok->start;
1519 *p_end = tok->cur;
1520 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001521 }
1522 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001523 tok_backup(tok, c);
1524 }
1525 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001526 }
1527 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 tok_backup(tok, c);
1529 }
1530 *p_start = tok->start;
1531 *p_end = tok->cur;
1532 return DOT;
1533 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001534
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001535 /* Number */
1536 if (isdigit(c)) {
1537 if (c == '0') {
1538 /* Hex, octal or binary -- maybe. */
1539 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001540 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001541 /* Hex */
1542 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001544 if (c == '_') {
1545 c = tok_nextc(tok);
1546 }
1547 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001548 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001549 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001550 }
1551 do {
1552 c = tok_nextc(tok);
1553 } while (isxdigit(c));
1554 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 }
1556 else if (c == 'o' || c == 'O') {
1557 /* Octal */
1558 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001560 if (c == '_') {
1561 c = tok_nextc(tok);
1562 }
1563 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001564 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001565 if (isdigit(c)) {
1566 return syntaxerror(tok,
1567 "invalid digit '%c' in octal literal", c);
1568 }
1569 else {
1570 return syntaxerror(tok, "invalid octal literal");
1571 }
Brett Cannona721aba2016-09-09 14:57:09 -07001572 }
1573 do {
1574 c = tok_nextc(tok);
1575 } while ('0' <= c && c < '8');
1576 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001577 if (isdigit(c)) {
1578 return syntaxerror(tok,
1579 "invalid digit '%c' in octal literal", c);
1580 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001581 }
1582 else if (c == 'b' || c == 'B') {
1583 /* Binary */
1584 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001585 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001586 if (c == '_') {
1587 c = tok_nextc(tok);
1588 }
1589 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001590 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001591 if (isdigit(c)) {
1592 return syntaxerror(tok,
1593 "invalid digit '%c' in binary literal", c);
1594 }
1595 else {
1596 return syntaxerror(tok, "invalid binary literal");
1597 }
Brett Cannona721aba2016-09-09 14:57:09 -07001598 }
1599 do {
1600 c = tok_nextc(tok);
1601 } while (c == '0' || c == '1');
1602 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001603 if (isdigit(c)) {
1604 return syntaxerror(tok,
1605 "invalid digit '%c' in binary literal", c);
1606 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 }
1608 else {
1609 int nonzero = 0;
1610 /* maybe old-style octal; c is first char of it */
1611 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001612 while (1) {
1613 if (c == '_') {
1614 c = tok_nextc(tok);
1615 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001616 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001617 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001618 }
1619 }
1620 if (c != '0') {
1621 break;
1622 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 c = tok_nextc(tok);
1624 }
Brett Cannona721aba2016-09-09 14:57:09 -07001625 if (isdigit(c)) {
1626 nonzero = 1;
1627 c = tok_decimal_tail(tok);
1628 if (c == 0) {
1629 return ERRORTOKEN;
1630 }
1631 }
1632 if (c == '.') {
1633 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001635 }
1636 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001638 }
1639 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001640 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001641 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001643 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001645 return syntaxerror(tok,
1646 "leading zeros in decimal integer "
1647 "literals are not permitted; "
1648 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001649 }
1650 }
1651 }
1652 else {
1653 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001654 c = tok_decimal_tail(tok);
1655 if (c == 0) {
1656 return ERRORTOKEN;
1657 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 {
1659 /* Accept floating point numbers. */
1660 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001661 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 fraction:
1663 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001664 if (isdigit(c)) {
1665 c = tok_decimal_tail(tok);
1666 if (c == 0) {
1667 return ERRORTOKEN;
1668 }
1669 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 }
1671 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001672 int e;
1673 exponent:
1674 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 /* Exponent part */
1676 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001677 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001679 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001680 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001681 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001682 }
1683 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001685 tok_backup(tok, e);
1686 *p_start = tok->start;
1687 *p_end = tok->cur;
1688 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 }
Brett Cannona721aba2016-09-09 14:57:09 -07001690 c = tok_decimal_tail(tok);
1691 if (c == 0) {
1692 return ERRORTOKEN;
1693 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001694 }
Brett Cannona721aba2016-09-09 14:57:09 -07001695 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 /* Imaginary part */
1697 imaginary:
1698 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001699 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 }
1701 }
1702 tok_backup(tok, c);
1703 *p_start = tok->start;
1704 *p_end = tok->cur;
1705 return NUMBER;
1706 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001707
1708 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001709 /* String */
1710 if (c == '\'' || c == '"') {
1711 int quote = c;
1712 int quote_size = 1; /* 1 or 3 */
1713 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001714
Anthony Sottile995d9b92019-01-12 20:05:13 -08001715 /* Nodes of type STRING, especially multi line strings
1716 must be handled differently in order to get both
1717 the starting line number and the column offset right.
1718 (cf. issue 16806) */
1719 tok->first_lineno = tok->lineno;
1720 tok->multi_line_start = tok->line_start;
1721
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 /* Find the quote size and start of string */
1723 c = tok_nextc(tok);
1724 if (c == quote) {
1725 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001726 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001728 }
1729 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001731 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 }
Brett Cannona721aba2016-09-09 14:57:09 -07001733 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001734 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001735 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001736
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 /* Get rest of string */
1738 while (end_quote_size != quote_size) {
1739 c = tok_nextc(tok);
1740 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001741 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001743 }
1744 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001746 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 tok->cur = tok->inp;
1748 return ERRORTOKEN;
1749 }
1750 if (quote_size == 1 && c == '\n') {
1751 tok->done = E_EOLS;
1752 tok->cur = tok->inp;
1753 return ERRORTOKEN;
1754 }
Brett Cannona721aba2016-09-09 14:57:09 -07001755 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001757 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 else {
1759 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001760 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001761 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001762 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001763 }
1764 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001765
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 *p_start = tok->start;
1767 *p_end = tok->cur;
1768 return STRING;
1769 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001770
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001771 /* Line continuation */
1772 if (c == '\\') {
1773 c = tok_nextc(tok);
1774 if (c != '\n') {
1775 tok->done = E_LINECONT;
1776 tok->cur = tok->inp;
1777 return ERRORTOKEN;
1778 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001779 c = tok_nextc(tok);
1780 if (c == EOF) {
1781 tok->done = E_EOF;
1782 tok->cur = tok->inp;
1783 return ERRORTOKEN;
1784 } else {
1785 tok_backup(tok, c);
1786 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001787 tok->cont_line = 1;
1788 goto again; /* Read next line */
1789 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001790
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001791 /* Check for two-character token */
1792 {
1793 int c2 = tok_nextc(tok);
1794 int token = PyToken_TwoChars(c, c2);
1795 if (token != OP) {
1796 int c3 = tok_nextc(tok);
1797 int token3 = PyToken_ThreeChars(c, c2, c3);
1798 if (token3 != OP) {
1799 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001800 }
1801 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 tok_backup(tok, c3);
1803 }
1804 *p_start = tok->start;
1805 *p_end = tok->cur;
1806 return token;
1807 }
1808 tok_backup(tok, c2);
1809 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001810
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 /* Keep track of parentheses nesting level */
1812 switch (c) {
1813 case '(':
1814 case '[':
1815 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001816 if (tok->level >= MAXLEVEL) {
1817 return syntaxerror(tok, "too many nested parentheses");
1818 }
1819 tok->parenstack[tok->level] = c;
1820 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001821 tok->level++;
1822 break;
1823 case ')':
1824 case ']':
1825 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001826 if (!tok->level) {
1827 return syntaxerror(tok, "unmatched '%c'", c);
1828 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001829 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001830 int opening = tok->parenstack[tok->level];
1831 if (!((opening == '(' && c == ')') ||
1832 (opening == '[' && c == ']') ||
1833 (opening == '{' && c == '}')))
1834 {
1835 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1836 return syntaxerror(tok,
1837 "closing parenthesis '%c' does not match "
1838 "opening parenthesis '%c' on line %d",
1839 c, opening, tok->parenlinenostack[tok->level]);
1840 }
1841 else {
1842 return syntaxerror(tok,
1843 "closing parenthesis '%c' does not match "
1844 "opening parenthesis '%c'",
1845 c, opening);
1846 }
1847 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001848 break;
1849 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001850
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 /* Punctuation character */
1852 *p_start = tok->start;
1853 *p_end = tok->cur;
1854 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001855}
1856
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001857int
Andy Lester384f3c52020-02-27 20:44:52 -06001858PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001859{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001860 int result = tok_get(tok, p_start, p_end);
1861 if (tok->decoding_erred) {
1862 result = ERRORTOKEN;
1863 tok->done = E_DECODE;
1864 }
1865 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001866}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001867
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001868/* Get the encoding of a Python file. Check for the coding cookie and check if
1869 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001870
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001871 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1872 encoding in the first or second line of the file (in which case the encoding
1873 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001874
Victor Stinner00d7abd2020-12-01 09:56:42 +01001875 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001876 by the caller. */
1877
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001878char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001879PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001880{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001881 struct tok_state *tok;
1882 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001883 const char *p_start = NULL;
1884 const char *p_end = NULL;
1885 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001886
Victor Stinnerdaf45552013-08-28 00:53:59 +02001887 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001888 if (fd < 0) {
1889 return NULL;
1890 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001891
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001892 fp = fdopen(fd, "r");
1893 if (fp == NULL) {
1894 return NULL;
1895 }
1896 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1897 if (tok == NULL) {
1898 fclose(fp);
1899 return NULL;
1900 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001901 if (filename != NULL) {
1902 Py_INCREF(filename);
1903 tok->filename = filename;
1904 }
1905 else {
1906 tok->filename = PyUnicode_FromString("<string>");
1907 if (tok->filename == NULL) {
1908 fclose(fp);
1909 PyTokenizer_Free(tok);
1910 return encoding;
1911 }
1912 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001913 while (tok->lineno < 2 && tok->done == E_OK) {
1914 PyTokenizer_Get(tok, &p_start, &p_end);
1915 }
1916 fclose(fp);
1917 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01001918 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001919 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301920 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001921 }
1922 PyTokenizer_Free(tok);
1923 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001924}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001925
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001926char *
1927PyTokenizer_FindEncoding(int fd)
1928{
1929 return PyTokenizer_FindEncodingFilename(fd, NULL);
1930}
1931
Guido van Rossum408027e1996-12-30 16:17:54 +00001932#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001933
1934void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001935tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001936{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001937 printf("%s", _PyParser_TokenNames[type]);
1938 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1939 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001940}
1941
1942#endif