blob: 3738a9021fcd78b98db233c291dc1974bc1c1ca5 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080018/* Alternate tab spacing */
19#define ALTTABSIZE 1
20
/* True when C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128 (a byte of a non-ASCII character).
   C is parenthesized at every use so that expression arguments parse as
   intended; it is still evaluated several times, so do not pass an
   argument with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True when C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128 (a byte of a non-ASCII character).
   C is parenthesized at every use so that expression arguments parse as
   intended; it is still evaluated several times, so do not pass an
   argument with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034
Guido van Rossum4fe87291992-02-26 15:24:44 +000035/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000042
Brett Cannond5ec98c2007-10-20 02:54:14 +000043
Guido van Rossumdcfcd142019-01-31 03:40:27 -080044/* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46static const char* type_comment_prefix = "# type: ";
47
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000051tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052{
Victor Stinner00d7abd2020-12-01 09:56:42 +010053 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000054 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060057 tok->buf = tok->cur = tok->inp = NULL;
Pablo Galindocd8dcbc2021-03-14 04:38:40 +010058 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060061 tok->start = NULL;
62 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Guido van Rossum495da292019-03-07 12:38:08 -080084 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -070088 tok->interactive_underflow = IUNDERFLOW_NORMAL;
Pablo Galindo Salgado07cf66f2021-11-21 04:15:22 +000089 tok->str = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000091}
92
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000093static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070094new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095{
Victor Stinner00d7abd2020-12-01 09:56:42 +010096 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 memcpy(result, s, len);
102 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000104}
105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100111 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are examined, case-insensitively
   and with '_' treated as '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
Pablo Galindo261a4522021-03-28 23:48:05 +0100166 if (memcmp(t, "coding", 6) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
Pablo Galindo261a4522021-03-28 23:48:05 +0100173 } while (t[0] == ' ' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100187 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200210 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 /* It's a continuation line, so it can't be a coding spec. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100212 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100215 if (!get_coding_spec(line, &cs, size, tok)) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700216 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100217 }
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100226 tok->decoding_state = STATE_NORMAL;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200227 break;
228 }
229 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200231 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100232 tok->decoding_state = STATE_NORMAL;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (tok->encoding == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100234 assert(tok->decoding_readline == NULL);
235 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236 error_ret(tok);
237 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238 PyMem_Free(cs);
239 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 } else { /* then, compare cs with BOM */
Pablo Galindo261a4522021-03-28 23:48:05 +0100243 if (strcmp(tok->encoding, cs) != 0) {
244 error_ret(tok);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 PyErr_Format(PyExc_SyntaxError,
246 "encoding problem: %s with BOM", cs);
Pablo Galindo261a4522021-03-28 23:48:05 +0100247 PyMem_Free(cs);
248 return 0;
249 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100250 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100252 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253}
254
255/* See whether the file starts with a BOM. If it does,
256 invoke the set_readline function with the new encoding.
257 Return 1 on success, 0 on failure. */
258
259static int
260check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 void unget_char(int, struct tok_state *),
262 int set_readline(struct tok_state *, const char *),
263 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 int ch1, ch2, ch3;
266 ch1 = get_char(tok);
Pablo Galindo261a4522021-03-28 23:48:05 +0100267 tok->decoding_state = STATE_SEEK_CODING;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (ch1 == EOF) {
269 return 1;
270 } else if (ch1 == 0xEF) {
271 ch2 = get_char(tok);
272 if (ch2 != 0xBB) {
273 unget_char(ch2, tok);
274 unget_char(ch1, tok);
275 return 1;
276 }
277 ch3 = get_char(tok);
278 if (ch3 != 0xBF) {
279 unget_char(ch3, tok);
280 unget_char(ch2, tok);
281 unget_char(ch1, tok);
282 return 1;
283 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000284#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 /* Disable support for UTF-16 BOMs until a decision
286 is made whether this needs to be supported. */
287 } else if (ch1 == 0xFE) {
288 ch2 = get_char(tok);
289 if (ch2 != 0xFF) {
290 unget_char(ch2, tok);
291 unget_char(ch1, tok);
292 return 1;
293 }
294 if (!set_readline(tok, "utf-16-be"))
295 return 0;
296 tok->decoding_state = STATE_NORMAL;
297 } else if (ch1 == 0xFF) {
298 ch2 = get_char(tok);
299 if (ch2 != 0xFE) {
300 unget_char(ch2, tok);
301 unget_char(ch1, tok);
302 return 1;
303 }
304 if (!set_readline(tok, "utf-16-le"))
305 return 0;
306 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 } else {
309 unget_char(ch1, tok);
310 return 1;
311 }
312 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100313 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700314 tok->encoding = new_string("utf-8", 5, tok);
315 if (!tok->encoding)
316 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 /* No need to set_readline: input is already utf-8 */
318 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319}
320
Pablo Galindo261a4522021-03-28 23:48:05 +0100321static int
322tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348}
349
350
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 stored the result in tok->decoding_buffer
Pablo Galindo261a4522021-03-28 23:48:05 +0100358 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
Pablo Galindo261a4522021-03-28 23:48:05 +0100361 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Pablo Galindo261a4522021-03-28 23:48:05 +0100363 reached): see tok_nextc and its calls to tok_reserve_buf.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
Pablo Galindo261a4522021-03-28 23:48:05 +0100366static int
367tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368{
Pablo Galindo261a4522021-03-28 23:48:05 +0100369 Py_ssize_t cur = tok->cur - tok->buf;
370 Py_ssize_t oldsize = tok->inp - tok->buf;
371 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372 if (newsize > tok->end - tok->buf) {
373 char *newbuf = tok->buf;
374 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700375 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
Pablo Galindo261a4522021-03-28 23:48:05 +0100377 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378 if (newbuf == NULL) {
379 tok->done = E_NOMEM;
380 return 0;
381 }
382 tok->buf = newbuf;
383 tok->cur = tok->buf + cur;
384 tok->inp = tok->buf + oldsize;
385 tok->end = tok->buf + newsize;
386 tok->start = start < 0 ? NULL : tok->buf + start;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700387 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
Pablo Galindo261a4522021-03-28 23:48:05 +0100389 }
390 return 1;
391}
392
393static int
394tok_readline_recode(struct tok_state *tok) {
395 PyObject *line;
396 const char *buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 Py_ssize_t buflen;
Pablo Galindo261a4522021-03-28 23:48:05 +0100398 line = tok->decoding_buffer;
399 if (line == NULL) {
400 line = PyObject_CallNoArgs(tok->decoding_readline);
401 if (line == NULL) {
402 error_ret(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 goto error;
404 }
405 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100406 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 tok->decoding_buffer = NULL;
Pablo Galindo261a4522021-03-28 23:48:05 +0100408 }
409 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410 if (buf == NULL) {
411 error_ret(tok);
412 goto error;
413 }
414 if (!tok_reserve_buf(tok, buflen + 1)) {
415 goto error;
416 }
417 memcpy(tok->inp, buf, buflen);
418 tok->inp += buflen;
419 *tok->inp = '\0';
420 if (tok->fp_interactive &&
421 tok_concatenate_interactive_new_line(tok, buf) == -1) {
422 goto error;
423 }
424 Py_DECREF(line);
425 return 1;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426error:
Pablo Galindo261a4522021-03-28 23:48:05 +0100427 Py_XDECREF(line);
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429}
430
431/* Set the readline function for TOK to a StreamReader's
432 readline function. The StreamReader is named ENC.
433
434 This function is called from check_bom and check_coding_spec.
435
436 ENC is usually identical to the future value of tok->encoding,
437 except for the (currently unsupported) case of UTF-16.
438
439 Return 1 on success, 0 on failure. */
440
441static int
442fp_setreadl(struct tok_state *tok, const char* enc)
443{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200445 _Py_IDENTIFIER(open);
446 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000447 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200448 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
Victor Stinner22a351a2010-10-14 12:04:34 +0000450 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200451 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100452 * position of tok->fp. If tok->fp was opened in text mode on Windows,
453 * its file position counts CRLF as one char and can't be directly mapped
454 * to the file offset for fd. Instead we step back one byte and read to
455 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200456 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100457 if (pos == -1 ||
458 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000459 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700460 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000461 }
462
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700463 io = PyImport_ImportModuleNoBlock("io");
464 if (io == NULL)
465 return 0;
466
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200467 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000468 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700469 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700471 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200473 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700474 Py_DECREF(stream);
475 if (readline == NULL)
476 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300477 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700478
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100479 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100480 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700481 if (bufobj == NULL)
482 return 0;
483 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100484 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700486 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487}
488
489/* Fetch the next byte from TOK. */
490
491static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493}
494
495/* Unfetch the last byte back into TOK. */
496
497static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499}
500
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   Continuation bytes are examined first-to-last, so the scan stops at
   the first byte that is not a continuation byte -- in particular at a
   terminating NUL -- and never reads beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xC0) {
        /* following byte */
        return 0;
    }
    if (*s < 0xE0) {
        trailing = 1;
    }
    else if (*s < 0xF0) {
        trailing = 2;
    }
    else if (*s < 0xF8) {
        trailing = 3;
    }
    else {
        return 0;
    }

    for (i = 1; i <= trailing; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return trailing + 1;
}
528
Pablo Galindo261a4522021-03-28 23:48:05 +0100529static int
530ensure_utf8(char *line, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 int badchar = 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000538 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000539 }
540 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
Pablo Galindo261a4522021-03-28 23:48:05 +0100543 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200544 PyErr_Format(PyExc_SyntaxError,
Pablo Galindo261a4522021-03-28 23:48:05 +0100545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
Miss Islington (bot)f7f1c262021-07-30 07:25:28 -0700548 "see https://python.org/dev/peps/pep-0263/ for details",
Pablo Galindo261a4522021-03-28 23:48:05 +0100549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100552 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Fetch a byte from TOK, using the string buffer. */
556
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000557static int
558buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562/* Unfetch a byte from TOK, using the string buffer. */
563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000564static void
565buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 tok->str--;
567 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568}
569
570/* Set the readline function for TOK to ENC. For the string-based
571 tokenizer, this means to just record the encoding. */
572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000573static int
574buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 tok->enc = enc;
576 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577}
578
579/* Return a UTF-8 encoding Python string object from the
580 C byte string STR, which is encoded with ENC. */
581
582static PyObject *
583translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000593
594static char *
595translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 char *buf, *current;
599 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100600 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000629 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100631 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000632 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100633 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000634 }
635 buf = result;
636 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000638}
639
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640/* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
Andy Lester384f3c52020-02-27 20:44:52 -0600644static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000645decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600648 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 if (tok->enc != NULL) {
662 utf8 = translate_into_utf8(str, tok->enc);
663 if (utf8 == NULL)
664 return error_ret(tok);
665 str = PyBytes_AsString(utf8);
666 }
667 for (s = str;; s++) {
668 if (*s == '\0') break;
669 else if (*s == '\n') {
670 assert(lineno < 2);
671 newl[lineno] = s;
672 lineno++;
673 if (lineno == 2) break;
674 }
675 }
676 tok->enc = NULL;
677 /* need to check line 1 and 2 separately since check_coding_spec
678 assumes a single line as input */
679 if (newl[0]) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100680 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681 return NULL;
682 }
683 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685 tok, buf_setreadl))
Pablo Galindo261a4522021-03-28 23:48:05 +0100686 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 }
688 }
689 if (tok->enc != NULL) {
690 assert(utf8 == NULL);
691 utf8 = translate_into_utf8(str, tok->enc);
692 if (utf8 == NULL)
693 return error_ret(tok);
694 str = PyBytes_AS_STRING(utf8);
695 }
696 assert(tok->decoding_buffer == NULL);
697 tok->decoding_buffer = utf8; /* CAUTION */
698 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699}
700
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701/* Set up tokenizer for string */
702
703struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600707 char *decoded;
708
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (tok == NULL)
710 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyTokenizer_Free(tok);
714 return NULL;
715 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716
Andy Lester384f3c52020-02-27 20:44:52 -0600717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720}
721
Pablo Galindo261a4522021-03-28 23:48:05 +0100722/* Set up tokenizer for UTF-8 string */
723
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000724struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000725PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600728 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 if (tok == NULL)
730 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyTokenizer_Free(tok);
734 return NULL;
735 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100736 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600738 tok->str = translated;
Pablo Galindo261a4522021-03-28 23:48:05 +0100739 tok->encoding = new_string("utf-8", 5, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744
Andy Lester384f3c52020-02-27 20:44:52 -0600745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748}
749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300753PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100771 tok->encoding = new_string(enc, strlen(enc), tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779}
780
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781/* Free a tok_state structure */
782
783void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000784PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785{
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100786 if (tok->encoding != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100787 PyMem_Free(tok->encoding);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100788 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200791 Py_XDECREF(tok->filename);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100792 if (tok->fp != NULL && tok->buf != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100793 PyMem_Free(tok->buf);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100794 }
795 if (tok->input) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100796 PyMem_Free(tok->input);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100801 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
Pablo Galindo261a4522021-03-28 23:48:05 +0100804static int
805tok_readline_raw(struct tok_state *tok)
806{
807 do {
808 if (!tok_reserve_buf(tok, BUFSIZ)) {
809 return 0;
810 }
811 char *line = Py_UniversalNewlineFgets(tok->inp,
812 (int)(tok->end - tok->inp),
813 tok->fp, NULL);
814 if (line == NULL) {
815 return 1;
816 }
817 if (tok->fp_interactive &&
818 tok_concatenate_interactive_new_line(tok, line) == -1) {
819 return 0;
820 }
Miss Islington (bot)94483f12021-12-12 08:52:49 -0800821 tok->inp = strchr(tok->inp, '\0');
822 if (tok->inp == tok->buf) {
Pablo Galindo92a02c12021-03-30 00:24:49 +0100823 return 0;
824 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100825 } while (tok->inp[-1] != '\n');
826 return 1;
827}
828
829static int
830tok_underflow_string(struct tok_state *tok) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL) {
833 end++;
834 }
835 else {
836 end = strchr(tok->inp, '\0');
837 if (end == tok->inp) {
838 tok->done = E_EOF;
839 return 0;
840 }
841 }
842 if (tok->start == NULL) {
843 tok->buf = tok->cur;
844 }
845 tok->line_start = tok->cur;
846 tok->lineno++;
847 tok->inp = end;
848 return 1;
849}
850
/* Refill the tokenizer buffer with one line read through PyOS_Readline().
 *
 * Returns 1 when a non-empty line was stored in the buffer, or when
 * tok->interactive_underflow is IUNDERFLOW_STOP (tok->done is then set to
 * E_INTERACT_STOP and nothing is read).
 * Returns 0 otherwise. tok->done is set here to E_INTR (readline returned
 * NULL), E_EOF (empty line), E_DECODE or E_NOMEM on the matching paths;
 * the other failure paths leave it to the helper that failed
 * (NOTE(review): confirm the helpers set tok->done themselves).
 */
851static int
852tok_underflow_interactive(struct tok_state *tok) {
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -0700853 if (tok->interactive_underflow == IUNDERFLOW_STOP) {
854 tok->done = E_INTERACT_STOP;
855 return 1;
856 }
 /* Prompt on stdout; read from tok->fp, or from stdin when fp is NULL. */
Miss Islington (bot)91e88892022-02-03 15:32:22 -0800857 char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
Pablo Galindo261a4522021-03-28 23:48:05 +0100858 if (newtok != NULL) {
 /* The raw line is freed here; from now on newtok refers to the
    newline-translated copy. */
859 char *translated = translate_newlines(newtok, 0, tok);
860 PyMem_Free(newtok);
861 if (translated == NULL) {
862 return 0;
863 }
864 newtok = translated;
865 }
866 if (tok->encoding && newtok && *newtok) {
867 /* Recode to UTF-8 */
868 Py_ssize_t buflen;
869 const char* buf;
870 PyObject *u = translate_into_utf8(newtok, tok->encoding);
871 PyMem_Free(newtok);
872 if (u == NULL) {
873 tok->done = E_DECODE;
874 return 0;
875 }
 /* Copy the bytes object's contents into a PyMem buffer so that newtok
    can later be released with PyMem_Free() like the other paths. */
876 buflen = PyBytes_GET_SIZE(u);
877 buf = PyBytes_AS_STRING(u);
878 newtok = PyMem_Malloc(buflen+1);
879 if (newtok == NULL) {
880 Py_DECREF(u);
881 tok->done = E_NOMEM;
882 return 0;
883 }
884 strcpy(newtok, buf);
885 Py_DECREF(u);
886 }
 /* newtok may still be NULL at this point (readline returned NULL). */
887 if (tok->fp_interactive &&
888 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
889 PyMem_Free(newtok);
890 return 0;
891 }
 /* Switch to the follow-up prompt for subsequent reads, if there is one. */
892 if (tok->nextprompt != NULL) {
893 tok->prompt = tok->nextprompt;
894 }
895 if (newtok == NULL) {
896 tok->done = E_INTR;
897 }
898 else if (*newtok == '\0') {
899 PyMem_Free(newtok);
900 tok->done = E_EOF;
901 }
902 else if (tok->start != NULL) {
 /* A token is in progress (tok->start is set): keep the existing buffer
    and copy the new line to tok->cur. multi_line_start is saved as an
    offset and rebuilt afterwards, presumably because tok_reserve_buf()
    can move tok->buf -- TODO confirm. */
903 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
904 size_t size = strlen(newtok);
905 tok->lineno++;
906 if (!tok_reserve_buf(tok, size + 1)) {
907 PyMem_Free(tok->buf);
908 tok->buf = NULL;
909 PyMem_Free(newtok);
910 return 0;
911 }
 /* size + 1 so that the terminating NUL is copied as well. */
912 memcpy(tok->cur, newtok, size + 1);
913 PyMem_Free(newtok);
914 tok->inp += size;
915 tok->multi_line_start = tok->buf + cur_multi_line_start;
916 }
917 else {
 /* No token in progress: discard the old buffer and adopt newtok as the
    whole buffer (tok->buf now owns it). */
918 tok->lineno++;
919 PyMem_Free(tok->buf);
920 tok->buf = newtok;
921 tok->cur = tok->buf;
922 tok->line_start = tok->buf;
923 tok->inp = strchr(tok->buf, '\0');
924 tok->end = tok->inp + 1;
925 }
 /* E_INTR / E_EOF set above end up here: write a newline to stderr when a
    prompt is active and report failure. */
926 if (tok->done != E_OK) {
927 if (tok->prompt != NULL) {
928 PySys_WriteStderr("\n");
929 }
930 return 0;
931 }
932 return 1;
933}
934
/* Refill the tokenizer buffer with the next line of a (non-interactive)
 * file.
 *
 * Returns 1 when a line is available between tok->cur and tok->inp and
 * tok->done is E_OK. Returns 0 on end of file (tok->done = E_EOF) or when
 * BOM detection, reading, the coding-spec check or UTF-8 validation fails.
 */
935static int
936tok_underflow_file(struct tok_state *tok) {
 /* No token in progress: the buffer contents are no longer needed, so
    start filling it from the beginning again. */
937 if (tok->start == NULL) {
938 tok->cur = tok->inp = tok->buf;
939 }
940 if (tok->decoding_state == STATE_INIT) {
941 /* We have not yet determined the encoding.
942 If an encoding is found, use the file-pointer
943 reader functions from now on. */
944 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945 error_ret(tok);
946 return 0;
947 }
948 assert(tok->decoding_state != STATE_INIT);
949 }
950 /* Read until '\n' or EOF */
951 if (tok->decoding_readline != NULL) {
952 /* We already have a codec associated with this input. */
953 if (!tok_readline_recode(tok)) {
954 return 0;
955 }
956 }
957 else {
958 /* We want a 'raw' read. */
959 if (!tok_readline_raw(tok)) {
960 return 0;
961 }
962 }
 /* The read succeeded but added nothing: end of file. */
963 if (tok->inp == tok->cur) {
964 tok->done = E_EOF;
965 return 0;
966 }
967 if (tok->inp[-1] != '\n') {
968 /* Last line does not end in \n, fake one */
969 *tok->inp++ = '\n';
970 *tok->inp = '\0';
971 }
972
973 tok->lineno++;
 /* A coding declaration is only looked for on lines 1 and 2; from line 3
    on the decoding state is simply switched to STATE_NORMAL. */
974 if (tok->decoding_state != STATE_NORMAL) {
975 if (tok->lineno > 2) {
976 tok->decoding_state = STATE_NORMAL;
977 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100978 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Pablo Galindo261a4522021-03-28 23:48:05 +0100979 tok, fp_setreadl))
980 {
981 return 0;
982 }
983 }
984 /* The default encoding is UTF-8, so make sure we don't have any
985 non-UTF-8 sequences in it. */
Miss Islington (bot)94483f12021-12-12 08:52:49 -0800986 if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
987 error_ret(tok);
988 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100989 }
990 assert(tok->done == E_OK);
991 return tok->done == E_OK;
992}
993
#if defined(Py_DEBUG)
/* Debug helper: write the `size` bytes starting at `s` to `f` as a
 * double-quoted string, escaping anything that is not printable ASCII.
 *
 * A NULL `s` is written as the bare word NULL. Newline, carriage return,
 * tab, form feed, both quote characters and backslash get C-style
 * two-character escapes; every other byte outside 0x20..0x7e (this
 * includes DEL, 0x7f) is written as \xNN. Escaping backslash keeps the
 * output unambiguous: a literal backslash followed by 'n' is no longer
 * printed the same way as a newline.
 */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, not printable. */
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001022
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001023/* Get next char, updating state; error code goes into tok->done */
1024
1025static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001026tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001027{
Pablo Galindo261a4522021-03-28 23:48:05 +01001028 int rc;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 for (;;) {
1030 if (tok->cur != tok->inp) {
1031 return Py_CHARMASK(*tok->cur++); /* Fast path */
1032 }
1033 if (tok->done != E_OK)
1034 return EOF;
1035 if (tok->fp == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +01001036 rc = tok_underflow_string(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001038 else if (tok->prompt != NULL) {
1039 rc = tok_underflow_interactive(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 }
1041 else {
Pablo Galindo261a4522021-03-28 23:48:05 +01001042 rc = tok_underflow_file(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001044#if defined(Py_DEBUG)
Pablo Galindo261a4522021-03-28 23:48:05 +01001045 if (Py_DebugFlag) {
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001046 fprintf(stderr, "line[%d] = ", tok->lineno);
Miss Islington (bot)d8ca47c2021-10-29 10:21:15 -07001047 print_escape(stderr, tok->cur, tok->inp - tok->cur);
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001048 fprintf(stderr, " tok->done = %d\n", tok->done);
Pablo Galindo261a4522021-03-28 23:48:05 +01001049 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001050#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001051 if (!rc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001052 tok->cur = tok->inp;
1053 return EOF;
1054 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001055 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001057 Py_UNREACHABLE();
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001058}
1059
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001060/* Back-up one character */
1061
1062static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001063tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001066 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001067 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001068 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001069 if ((int)(unsigned char)*tok->cur != c) {
1070 Py_FatalError("tok_backup: wrong character");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001071 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001073}
1074
/* Build a SyntaxError for the current line and set it as the pending
 * exception.
 *
 * `format` / `vargs` produce the message (PyUnicode_FromFormatV syntax).
 * `col_offset` == -1 means "use the number of characters between
 * tok->line_start and tok->cur"; `end_col_offset` == -1 means "same as
 * col_offset". tok->done is set to E_ERROR and ERRORTOKEN is returned on
 * every path, including when building the exception itself fails.
 */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001075static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001076_syntaxerror_range(struct tok_state *tok, const char *format,
1077 int col_offset, int end_col_offset,
1078 va_list vargs)
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001079{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001080 PyObject *errmsg, *errtext, *args;
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001081 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001082 if (!errmsg) {
1083 goto error;
1084 }
1085
 /* First decode only the part of the line before tok->cur: its length in
    characters is the default column. Invalid UTF-8 is replaced rather
    than reported. */
1086 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1087 "replace");
1088 if (!errtext) {
1089 goto error;
1090 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001091
1092 if (col_offset == -1) {
1093 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1094 }
1095 if (end_col_offset == -1) {
1096 end_col_offset = col_offset;
1097 }
1098
 /* The text reported with the error is the line up to (not including) its
    first '\n'; re-decode if that differs from the span decoded above. */
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001099 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1100 if (line_len != tok->cur - tok->line_start) {
1101 Py_DECREF(errtext);
1102 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1103 "replace");
1104 }
1105 if (!errtext) {
1106 goto error;
1107 }
1108
 /* The "N" format unit hands our reference to errtext over to the tuple,
    which is why errtext is not released below. */
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001109 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1110 col_offset, errtext, tok->lineno, end_col_offset);
1111 if (args) {
1112 PyErr_SetObject(PyExc_SyntaxError, args);
1113 Py_DECREF(args);
1114 }
1115
 /* Reached on success as well: common cleanup and result. */
1116error:
1117 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001118 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001119 return ERRORTOKEN;
1120}
1121
1122static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001123syntaxerror(struct tok_state *tok, const char *format, ...)
1124{
1125 va_list vargs;
1126#ifdef HAVE_STDARG_PROTOTYPES
1127 va_start(vargs, format);
1128#else
1129 va_start(vargs);
1130#endif
1131 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1132 va_end(vargs);
1133 return ret;
1134}
1135
1136static int
1137syntaxerror_known_range(struct tok_state *tok,
1138 int col_offset, int end_col_offset,
1139 const char *format, ...)
1140{
1141 va_list vargs;
1142#ifdef HAVE_STDARG_PROTOTYPES
1143 va_start(vargs, format);
1144#else
1145 va_start(vargs);
1146#endif
1147 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1148 va_end(vargs);
1149 return ret;
1150}
1151
1152
1153
1154static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001155indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001156{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001157 tok->done = E_TABSPACE;
1158 tok->cur = tok->inp;
1159 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160}
1161
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001162static int
1163parser_warn(struct tok_state *tok, const char *format, ...)
1164{
1165 PyObject *errmsg;
1166 va_list vargs;
1167#ifdef HAVE_STDARG_PROTOTYPES
1168 va_start(vargs, format);
1169#else
1170 va_start(vargs);
1171#endif
1172 errmsg = PyUnicode_FromFormatV(format, vargs);
1173 va_end(vargs);
1174 if (!errmsg) {
1175 goto error;
1176 }
1177
1178 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1179 tok->lineno, NULL, NULL) < 0) {
1180 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1181 /* Replace the DeprecationWarning exception with a SyntaxError
1182 to get a more accurate error report */
1183 PyErr_Clear();
1184 syntaxerror(tok, "%U", errmsg);
1185 }
1186 goto error;
1187 }
1188 Py_DECREF(errmsg);
1189 return 0;
1190
1191error:
1192 Py_XDECREF(errmsg);
1193 tok->done = E_ERROR;
1194 return -1;
1195}
1196
/* Peek at the upcoming input without consuming it.
 *
 * Returns 1 when the next characters spell `test` exactly and the
 * character after them is not a potential identifier character; returns
 * 0 otherwise. Every character read is pushed back before returning, so
 * the tokenizer position is unchanged.
 */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *pos = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it keeps matching `test`. */
    while (*pos != '\0' && c == *pos) {
        pos++;
        c = tok_nextc(tok);
    }

    int found = 0;
    if (*pos == '\0') {
        found = !is_potential_identifier_char(c);
    }

    /* Undo the reads in reverse order: the last character first, then the
       matched prefix. */
    tok_backup(tok, c);
    while (pos != test) {
        tok_backup(tok, *--pos);
    }
    return found;
}
1219
/* Check the character `c` that immediately follows a numeric literal.
 *
 * Emit a deprecation warning only if the numeric literal is immediately
 * followed by one of the keywords which can occur after a numeric literal
 * in valid code: "and", "else", "for", "if", "in", "is" and "or".
 * This makes it possible to gradually deprecate existing valid code
 * without adding a warning before the error in most cases of an invalid
 * numeric literal (which would be confusing and break existing tests).
 * Raise a syntax error with a slightly better message than plain
 * "invalid syntax" if the numeric literal is immediately followed by
 * another keyword or identifier.
 *
 * `kind` names the literal type in the message ("invalid <kind> literal").
 * Returns 1 when tokenizing may continue, 0 when an error has been set.
 */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    int keyword_follows = 0;

    switch (c) {
    case 'a':
        keyword_follows = lookahead(tok, "nd");
        break;
    case 'e':
        keyword_follows = lookahead(tok, "lse");
        break;
    case 'f':
        keyword_follows = lookahead(tok, "or");
        break;
    case 'i': {
        int next = tok_nextc(tok);
        keyword_follows = (next == 'f' || next == 'n' || next == 's');
        tok_backup(tok, next);
        break;
    }
    case 'o':
        keyword_follows = lookahead(tok, "r");
        break;
    default:
        break;
    }

    if (keyword_follows) {
        /* Step back over `c` while warning, then consume it again. */
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
        return 1;
    }

    /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269/* Verify that the identifier follows PEP 3131.
1270 All identifier strings are guaranteed to be "ready" unicode objects.
1271 */
/* Validate the identifier spanning tok->start .. tok->cur.
 *
 * Returns 1 when _PyUnicode_ScanIdentifier() accepts the whole text.
 * Returns 0 otherwise: tok->done is set to E_DECODE or E_ERROR when the
 * text cannot be decoded or scanned, and a SyntaxError naming the
 * offending character is raised when an invalid character is found.
 * Also returns 0 immediately (without touching tok->done) when
 * tok->decoding_erred is already set.
 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001272static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001273verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001276 if (tok->decoding_erred)
1277 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001279 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001281 tok->done = E_DECODE;
1282 }
1283 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 tok->done = E_ERROR;
1285 }
1286 return 0;
1287 }
 /* `invalid` is used below as the index of the first character that is
    not valid in an identifier; it equals the string length when every
    character is valid, and is negative on error. */
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001288 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1289 if (invalid < 0) {
1290 Py_DECREF(s);
1291 tok->done = E_ERROR;
1292 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001293 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001294 assert(PyUnicode_GET_LENGTH(s) > 0);
1295 if (invalid < PyUnicode_GET_LENGTH(s)) {
1296 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
 /* When the bad character is not the last one, move tok->cur to just
    after it so the error points at that character. */
1297 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1298 /* Determine the offset in UTF-8 encoded input */
1299 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1300 if (s != NULL) {
1301 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1302 }
1303 if (s == NULL) {
1304 tok->done = E_ERROR;
1305 return 0;
1306 }
 /* s is now a bytes object: its size is the byte offset past the bad
    character. */
1307 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1308 }
1309 Py_DECREF(s);
1310 // PyUnicode_FromFormatV() does not support %X
1311 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001312 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001313 if (Py_UNICODE_ISPRINTABLE(ch)) {
1314 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1315 }
1316 else {
1317 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1318 }
1319 return 0;
1320 }
1321 Py_DECREF(s);
1322 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001323}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001324
/* Consume the rest of a run of decimal digits in which single
 * underscores may separate digit groups.
 *
 * Returns the first character after the run (already consumed from the
 * input). If an underscore is not followed by a digit, the offending
 * character is pushed back, a SyntaxError is raised and 0 is returned.
 */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1346
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347/* Get next token, after space stripping etc. */
1348
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001349static inline int
1350tok_continuation_line(struct tok_state *tok) {
1351 int c = tok_nextc(tok);
1352 if (c != '\n') {
1353 tok->done = E_LINECONT;
1354 return -1;
1355 }
1356 c = tok_nextc(tok);
1357 if (c == EOF) {
1358 tok->done = E_EOF;
1359 tok->cur = tok->inp;
1360 return -1;
1361 } else {
1362 tok_backup(tok, c);
1363 }
1364 return c;
1365}
1366
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001367static int
Andy Lester384f3c52020-02-27 20:44:52 -06001368tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001370 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001372
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001374 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 tok->start = NULL;
1376 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 /* Get indentation level */
1379 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001380 int col = 0;
1381 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 tok->atbol = 0;
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001383 int cont_line_col = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 for (;;) {
1385 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001386 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001388 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001390 col = (col / tok->tabsize + 1) * tok->tabsize;
1391 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 }
Brett Cannona721aba2016-09-09 14:57:09 -07001393 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001395 }
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001396 else if (c == '\\') {
1397 // Indentation cannot be split over multiple physical lines
1398 // using backslashes. This means that if we found a backslash
1399 // preceded by whitespace, **the first one we find** determines
1400 // the level of indentation of whatever comes next.
1401 cont_line_col = cont_line_col ? cont_line_col : col;
1402 if ((c = tok_continuation_line(tok)) == -1) {
1403 return ERRORTOKEN;
1404 }
1405 }
Brett Cannona721aba2016-09-09 14:57:09 -07001406 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001408 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 }
1410 tok_backup(tok, c);
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001411 if (c == '#' || c == '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 /* Lines with only whitespace and/or comments
1413 shouldn't affect the indentation and are
1414 not passed to the parser as NEWLINE tokens,
1415 except *totally* empty lines in interactive
1416 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001417 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001419 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001420 else if (tok->prompt != NULL && tok->lineno == 1) {
1421 /* In interactive mode, if the first line contains
1422 only spaces and/or a comment, let it through. */
1423 blankline = 0;
1424 col = altcol = 0;
1425 }
Brett Cannona721aba2016-09-09 14:57:09 -07001426 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001427 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001428 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001429 /* We can't jump back right here since we still
1430 may need to skip to the end of a comment */
1431 }
1432 if (!blankline && tok->level == 0) {
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001433 col = cont_line_col ? cont_line_col : col;
1434 altcol = cont_line_col ? cont_line_col : altcol;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 if (col == tok->indstack[tok->indent]) {
1436 /* No change */
1437 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001438 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 }
1440 }
1441 else if (col > tok->indstack[tok->indent]) {
1442 /* Indent -- always one */
1443 if (tok->indent+1 >= MAXINDENT) {
1444 tok->done = E_TOODEEP;
1445 tok->cur = tok->inp;
1446 return ERRORTOKEN;
1447 }
1448 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001449 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 }
1451 tok->pendin++;
1452 tok->indstack[++tok->indent] = col;
1453 tok->altindstack[tok->indent] = altcol;
1454 }
1455 else /* col < tok->indstack[tok->indent] */ {
1456 /* Dedent -- any number, must be consistent */
1457 while (tok->indent > 0 &&
1458 col < tok->indstack[tok->indent]) {
1459 tok->pendin--;
1460 tok->indent--;
1461 }
1462 if (col != tok->indstack[tok->indent]) {
1463 tok->done = E_DEDENT;
1464 tok->cur = tok->inp;
1465 return ERRORTOKEN;
1466 }
1467 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001468 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 }
1470 }
1471 }
1472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001474 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001475
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001476 /* Return pending indents/dedents */
1477 if (tok->pendin != 0) {
1478 if (tok->pendin < 0) {
1479 tok->pendin++;
1480 return DEDENT;
1481 }
1482 else {
1483 tok->pendin--;
1484 return INDENT;
1485 }
1486 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487
Guido van Rossum495da292019-03-07 12:38:08 -08001488 /* Peek ahead at the next character */
1489 c = tok_nextc(tok);
1490 tok_backup(tok, c);
1491 /* Check if we are closing an async function */
1492 if (tok->async_def
1493 && !blankline
1494 /* Due to some implementation artifacts of type comments,
1495 * a TYPE_COMMENT at the start of a function won't set an
1496 * indentation level and it will produce a NEWLINE after it.
1497 * To avoid spuriously ending an async function due to this,
1498 * wait until we have some non-newline char in front of us. */
1499 && c != '\n'
1500 && tok->level == 0
1501 /* There was a NEWLINE after ASYNC DEF,
1502 so we're past the signature. */
1503 && tok->async_def_nl
1504 /* Current indentation level is less than where
1505 the async function was defined */
1506 && tok->async_def_indent >= tok->indent)
1507 {
1508 tok->async_def = 0;
1509 tok->async_def_indent = 0;
1510 tok->async_def_nl = 0;
1511 }
1512
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001513 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001514 tok->start = NULL;
1515 /* Skip spaces */
1516 do {
1517 c = tok_nextc(tok);
1518 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001519
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 /* Set start of current token */
1521 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001522
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001523 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001524 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001525 const char *prefix, *p, *type_start;
1526
Brett Cannona721aba2016-09-09 14:57:09 -07001527 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001529 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001530
1531 if (tok->type_comments) {
1532 p = tok->start;
1533 prefix = type_comment_prefix;
1534 while (*prefix && p < tok->cur) {
1535 if (*prefix == ' ') {
1536 while (*p == ' ' || *p == '\t') {
1537 p++;
1538 }
1539 } else if (*prefix == *p) {
1540 p++;
1541 } else {
1542 break;
1543 }
1544
1545 prefix++;
1546 }
1547
1548 /* This is a type comment if we matched all of type_comment_prefix. */
1549 if (!*prefix) {
1550 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001551 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001552 tok_backup(tok, c); /* don't eat the newline or EOF */
1553
1554 type_start = p;
1555
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001556 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001557 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001558 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001559 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001560 && !(tok->cur > ignore_end
1561 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001562
1563 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001564 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001565 *p_end = tok->cur;
1566
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001567 /* If this type ignore is the only thing on the line, consume the newline also. */
1568 if (blankline) {
1569 tok_nextc(tok);
1570 tok->atbol = 1;
1571 }
1572 return TYPE_IGNORE;
1573 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001574 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001575 *p_end = tok->cur;
1576 return TYPE_COMMENT;
1577 }
1578 }
1579 }
Brett Cannona721aba2016-09-09 14:57:09 -07001580 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001581
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -07001582 if (tok->done == E_INTERACT_STOP) {
1583 return ENDMARKER;
1584 }
1585
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001586 /* Check for EOF and errors now */
1587 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001588 if (tok->level) {
1589 return ERRORTOKEN;
1590 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1592 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001593
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 /* Identifier (most frequent token!) */
1595 nonascii = 0;
1596 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001597 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001598 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001599 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001600 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001601 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001602 /* Since this is a backwards compatibility support literal we don't
1603 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001604 else if (!(saw_b || saw_u || saw_r || saw_f)
1605 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001606 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001607 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001608 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001609 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001610 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001611 }
1612 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001613 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001614 }
1615 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001616 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001617 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001619 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001621 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 }
1623 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001624 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001626 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 c = tok_nextc(tok);
1628 }
1629 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001630 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001632 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001633
1634 *p_start = tok->start;
1635 *p_end = tok->cur;
1636
Guido van Rossum495da292019-03-07 12:38:08 -08001637 /* async/await parsing block. */
1638 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1639 /* May be an 'async' or 'await' token. For Python 3.7 or
1640 later we recognize them unconditionally. For Python
1641 3.5 or 3.6 we recognize 'async' in front of 'def', and
1642 either one inside of 'async def'. (Technically we
1643 shouldn't recognize these at all for 3.4 or earlier,
1644 but there's no *valid* Python 3.4 code that would be
1645 rejected, and async functions will be rejected in a
1646 later phase.) */
1647 if (!tok->async_hacks || tok->async_def) {
1648 /* Always recognize the keywords. */
1649 if (memcmp(tok->start, "async", 5) == 0) {
1650 return ASYNC;
1651 }
1652 if (memcmp(tok->start, "await", 5) == 0) {
1653 return AWAIT;
1654 }
1655 }
1656 else if (memcmp(tok->start, "async", 5) == 0) {
1657 /* The current token is 'async'.
1658 Look ahead one token to see if that is 'def'. */
1659
1660 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001661 const char *ahead_tok_start = NULL;
1662 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001663 int ahead_tok_kind;
1664
1665 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1666 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1667 &ahead_tok_end);
1668
1669 if (ahead_tok_kind == NAME
1670 && ahead_tok.cur - ahead_tok.start == 3
1671 && memcmp(ahead_tok.start, "def", 3) == 0)
1672 {
1673 /* The next token is going to be 'def', so instead of
1674 returning a plain NAME token, return ASYNC. */
1675 tok->async_def_indent = tok->indent;
1676 tok->async_def = 1;
1677 return ASYNC;
1678 }
1679 }
1680 }
1681
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001682 return NAME;
1683 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001684
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 /* Newline */
1686 if (c == '\n') {
1687 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001688 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001690 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691 *p_start = tok->start;
1692 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1693 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001694 if (tok->async_def) {
1695 /* We're somewhere inside an 'async def' function, and
1696 we've encountered a NEWLINE after its signature. */
1697 tok->async_def_nl = 1;
1698 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 return NEWLINE;
1700 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001701
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 /* Period or number starting with period? */
1703 if (c == '.') {
1704 c = tok_nextc(tok);
1705 if (isdigit(c)) {
1706 goto fraction;
1707 } else if (c == '.') {
1708 c = tok_nextc(tok);
1709 if (c == '.') {
1710 *p_start = tok->start;
1711 *p_end = tok->cur;
1712 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001713 }
1714 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 tok_backup(tok, c);
1716 }
1717 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001718 }
1719 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 tok_backup(tok, c);
1721 }
1722 *p_start = tok->start;
1723 *p_end = tok->cur;
1724 return DOT;
1725 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001726
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 /* Number */
1728 if (isdigit(c)) {
1729 if (c == '0') {
1730 /* Hex, octal or binary -- maybe. */
1731 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 /* Hex */
1734 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001736 if (c == '_') {
1737 c = tok_nextc(tok);
1738 }
1739 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001740 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001741 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001742 }
1743 do {
1744 c = tok_nextc(tok);
1745 } while (isxdigit(c));
1746 } while (c == '_');
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001747 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1748 return ERRORTOKEN;
1749 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750 }
1751 else if (c == 'o' || c == 'O') {
1752 /* Octal */
1753 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001755 if (c == '_') {
1756 c = tok_nextc(tok);
1757 }
1758 if (c < '0' || c >= '8') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001759 if (isdigit(c)) {
1760 return syntaxerror(tok,
1761 "invalid digit '%c' in octal literal", c);
1762 }
1763 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001764 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001765 return syntaxerror(tok, "invalid octal literal");
1766 }
Brett Cannona721aba2016-09-09 14:57:09 -07001767 }
1768 do {
1769 c = tok_nextc(tok);
1770 } while ('0' <= c && c < '8');
1771 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001772 if (isdigit(c)) {
1773 return syntaxerror(tok,
1774 "invalid digit '%c' in octal literal", c);
1775 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001776 if (!verify_end_of_number(tok, c, "octal")) {
1777 return ERRORTOKEN;
1778 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 }
1780 else if (c == 'b' || c == 'B') {
1781 /* Binary */
1782 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001784 if (c == '_') {
1785 c = tok_nextc(tok);
1786 }
1787 if (c != '0' && c != '1') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001788 if (isdigit(c)) {
1789 return syntaxerror(tok,
1790 "invalid digit '%c' in binary literal", c);
1791 }
1792 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001793 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001794 return syntaxerror(tok, "invalid binary literal");
1795 }
Brett Cannona721aba2016-09-09 14:57:09 -07001796 }
1797 do {
1798 c = tok_nextc(tok);
1799 } while (c == '0' || c == '1');
1800 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001801 if (isdigit(c)) {
1802 return syntaxerror(tok,
1803 "invalid digit '%c' in binary literal", c);
1804 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001805 if (!verify_end_of_number(tok, c, "binary")) {
1806 return ERRORTOKEN;
1807 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001808 }
1809 else {
1810 int nonzero = 0;
1811 /* maybe old-style octal; c is first char of it */
1812 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001813 while (1) {
1814 if (c == '_') {
1815 c = tok_nextc(tok);
1816 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001817 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001818 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001819 }
1820 }
1821 if (c != '0') {
1822 break;
1823 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001824 c = tok_nextc(tok);
1825 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001826 char* zeros_end = tok->cur;
Brett Cannona721aba2016-09-09 14:57:09 -07001827 if (isdigit(c)) {
1828 nonzero = 1;
1829 c = tok_decimal_tail(tok);
1830 if (c == 0) {
1831 return ERRORTOKEN;
1832 }
1833 }
1834 if (c == '.') {
1835 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001836 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001837 }
1838 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001840 }
1841 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001842 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001843 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001844 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001845 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001846 tok_backup(tok, c);
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001847 return syntaxerror_known_range(
1848 tok, (int)(tok->start + 1 - tok->line_start),
1849 (int)(zeros_end - tok->line_start),
1850 "leading zeros in decimal integer "
1851 "literals are not permitted; "
1852 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001853 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001854 if (!verify_end_of_number(tok, c, "decimal")) {
1855 return ERRORTOKEN;
1856 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001857 }
1858 }
1859 else {
1860 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001861 c = tok_decimal_tail(tok);
1862 if (c == 0) {
1863 return ERRORTOKEN;
1864 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001865 {
1866 /* Accept floating point numbers. */
1867 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001868 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001869 fraction:
1870 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001871 if (isdigit(c)) {
1872 c = tok_decimal_tail(tok);
1873 if (c == 0) {
1874 return ERRORTOKEN;
1875 }
1876 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 }
1878 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001879 int e;
1880 exponent:
1881 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001882 /* Exponent part */
1883 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001884 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001885 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001886 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001887 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001888 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001889 }
1890 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001891 tok_backup(tok, c);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001892 if (!verify_end_of_number(tok, e, "decimal")) {
1893 return ERRORTOKEN;
1894 }
Benjamin Petersonc4161622014-06-07 12:36:39 -07001895 tok_backup(tok, e);
1896 *p_start = tok->start;
1897 *p_end = tok->cur;
1898 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001899 }
Brett Cannona721aba2016-09-09 14:57:09 -07001900 c = tok_decimal_tail(tok);
1901 if (c == 0) {
1902 return ERRORTOKEN;
1903 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001904 }
Brett Cannona721aba2016-09-09 14:57:09 -07001905 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001906 /* Imaginary part */
1907 imaginary:
1908 c = tok_nextc(tok);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001909 if (!verify_end_of_number(tok, c, "imaginary")) {
1910 return ERRORTOKEN;
1911 }
1912 }
1913 else if (!verify_end_of_number(tok, c, "decimal")) {
1914 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001915 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001916 }
1917 }
1918 tok_backup(tok, c);
1919 *p_start = tok->start;
1920 *p_end = tok->cur;
1921 return NUMBER;
1922 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001923
1924 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001925 /* String */
1926 if (c == '\'' || c == '"') {
1927 int quote = c;
1928 int quote_size = 1; /* 1 or 3 */
1929 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001930
Anthony Sottile995d9b92019-01-12 20:05:13 -08001931 /* Nodes of type STRING, especially multi line strings
1932 must be handled differently in order to get both
1933 the starting line number and the column offset right.
1934 (cf. issue 16806) */
1935 tok->first_lineno = tok->lineno;
1936 tok->multi_line_start = tok->line_start;
1937
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001938 /* Find the quote size and start of string */
1939 c = tok_nextc(tok);
1940 if (c == quote) {
1941 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001942 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001943 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001944 }
1945 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001946 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001947 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001948 }
Brett Cannona721aba2016-09-09 14:57:09 -07001949 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001950 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001951 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001952
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001953 /* Get rest of string */
1954 while (end_quote_size != quote_size) {
1955 c = tok_nextc(tok);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001956 if (c == EOF || (quote_size == 1 && c == '\n')) {
Miss Islington (bot)d03f3422021-06-12 13:27:02 -07001957 assert(tok->multi_line_start != NULL);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001958 // shift the tok_state's location into
1959 // the start of string, and report the error
1960 // from the initial quote character
1961 tok->cur = (char *)tok->start;
1962 tok->cur++;
1963 tok->line_start = tok->multi_line_start;
1964 int start = tok->lineno;
1965 tok->lineno = tok->first_lineno;
1966
Brett Cannona721aba2016-09-09 14:57:09 -07001967 if (quote_size == 3) {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001968 return syntaxerror(tok,
1969 "unterminated triple-quoted string literal"
1970 " (detected at line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001971 }
1972 else {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001973 return syntaxerror(tok,
1974 "unterminated string literal (detected at"
1975 " line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001976 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001977 }
Brett Cannona721aba2016-09-09 14:57:09 -07001978 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001979 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001980 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001981 else {
1982 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001983 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001984 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001985 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001986 }
1987 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001988
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001989 *p_start = tok->start;
1990 *p_end = tok->cur;
1991 return STRING;
1992 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001993
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001994 /* Line continuation */
1995 if (c == '\\') {
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001996 if ((c = tok_continuation_line(tok)) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001997 return ERRORTOKEN;
1998 }
1999 tok->cont_line = 1;
2000 goto again; /* Read next line */
2001 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002003 /* Check for two-character token */
2004 {
2005 int c2 = tok_nextc(tok);
2006 int token = PyToken_TwoChars(c, c2);
2007 if (token != OP) {
2008 int c3 = tok_nextc(tok);
2009 int token3 = PyToken_ThreeChars(c, c2, c3);
2010 if (token3 != OP) {
2011 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07002012 }
2013 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002014 tok_backup(tok, c3);
2015 }
2016 *p_start = tok->start;
2017 *p_end = tok->cur;
2018 return token;
2019 }
2020 tok_backup(tok, c2);
2021 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002022
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002023 /* Keep track of parentheses nesting level */
2024 switch (c) {
2025 case '(':
2026 case '[':
2027 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002028 if (tok->level >= MAXLEVEL) {
2029 return syntaxerror(tok, "too many nested parentheses");
2030 }
2031 tok->parenstack[tok->level] = c;
2032 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindoae7d3cd92021-01-20 12:53:52 +00002033 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002034 tok->level++;
2035 break;
2036 case ')':
2037 case ']':
2038 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002039 if (!tok->level) {
2040 return syntaxerror(tok, "unmatched '%c'", c);
2041 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002042 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002043 int opening = tok->parenstack[tok->level];
2044 if (!((opening == '(' && c == ')') ||
2045 (opening == '[' && c == ']') ||
2046 (opening == '{' && c == '}')))
2047 {
2048 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2049 return syntaxerror(tok,
2050 "closing parenthesis '%c' does not match "
2051 "opening parenthesis '%c' on line %d",
2052 c, opening, tok->parenlinenostack[tok->level]);
2053 }
2054 else {
2055 return syntaxerror(tok,
2056 "closing parenthesis '%c' does not match "
2057 "opening parenthesis '%c'",
2058 c, opening);
2059 }
2060 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002061 break;
2062 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002063
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002064 /* Punctuation character */
2065 *p_start = tok->start;
2066 *p_end = tok->cur;
2067 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002068}
2069
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002070int
Andy Lester384f3c52020-02-27 20:44:52 -06002071PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002072{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002073 int result = tok_get(tok, p_start, p_end);
2074 if (tok->decoding_erred) {
2075 result = ERRORTOKEN;
2076 tok->done = E_DECODE;
2077 }
2078 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002079}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002080
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002081/* Get the encoding of a Python file. Check for the coding cookie and check if
2082 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002083
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002084 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2085 encoding in the first or second line of the file (in which case the encoding
2086 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00002087
Victor Stinner00d7abd2020-12-01 09:56:42 +01002088 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002089 by the caller. */
2090
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002091char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002092PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00002093{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002094 struct tok_state *tok;
2095 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06002096 const char *p_start = NULL;
2097 const char *p_end = NULL;
2098 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002099
Victor Stinnerdaf45552013-08-28 00:53:59 +02002100 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002101 if (fd < 0) {
2102 return NULL;
2103 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02002104
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002105 fp = fdopen(fd, "r");
2106 if (fp == NULL) {
2107 return NULL;
2108 }
2109 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2110 if (tok == NULL) {
2111 fclose(fp);
2112 return NULL;
2113 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002114 if (filename != NULL) {
2115 Py_INCREF(filename);
2116 tok->filename = filename;
2117 }
2118 else {
2119 tok->filename = PyUnicode_FromString("<string>");
2120 if (tok->filename == NULL) {
2121 fclose(fp);
2122 PyTokenizer_Free(tok);
2123 return encoding;
2124 }
2125 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002126 while (tok->lineno < 2 && tok->done == E_OK) {
2127 PyTokenizer_Get(tok, &p_start, &p_end);
2128 }
2129 fclose(fp);
2130 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01002131 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Pablo Galindo261a4522021-03-28 23:48:05 +01002132 if (encoding) {
Hansraj Das69f37bc2019-08-15 21:49:07 +05302133 strcpy(encoding, tok->encoding);
Pablo Galindo261a4522021-03-28 23:48:05 +01002134 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002135 }
2136 PyTokenizer_Free(tok);
2137 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002138}
Thomas Wouters89d996e2007-09-08 17:39:28 +00002139
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002140char *
2141PyTokenizer_FindEncoding(int fd)
2142{
2143 return PyTokenizer_FindEncodingFilename(fd, NULL);
2144}
2145
#ifdef Py_DEBUG

/* Debug-only helper: write a token to stderr.

   The token name is looked up in _PyParser_TokenNames using type as the
   index (no range check is performed here).  For NAME, NUMBER, STRING and OP
   tokens the text between start and end is appended in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    const int has_text = (type == NAME)
                         || (type == NUMBER)
                         || (type == STRING)
                         || (type == OP);

    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    if (!has_text) {
        return;
    }
    /* Precision limits the output to the end - start bytes of the token. */
    fprintf(stderr, "(%.*s)", (int)(end - start), start);
}

#endif