blob: 267ccec33fd77081f21715f3e7c41ab48059fff0 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080018/* Alternate tab spacing */
19#define ALTTABSIZE 1
20
/* True when C can be the first character of an identifier: an ASCII letter,
   an underscore, or any value >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesized so that expressions such as
   `flag ? a : b` expand correctly.
   NOTE: C is evaluated several times; do not pass an expression with
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True when C can appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesized so that expressions such as
   `flag ? a : b` expand correctly.
   NOTE: C is evaluated several times; do not pass an expression with
   side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034
Guido van Rossum4fe87291992-02-26 15:24:44 +000035/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Pablo Galindo Salgado5b58db72022-02-08 12:25:15 +000042static int syntaxerror(struct tok_state *tok, const char *format, ...);
Brett Cannond5ec98c2007-10-20 02:54:14 +000043
Guido van Rossumdcfcd142019-01-31 03:40:27 -080044/* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46static const char* type_comment_prefix = "# type: ";
47
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000051tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052{
Victor Stinner00d7abd2020-12-01 09:56:42 +010053 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000054 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060057 tok->buf = tok->cur = tok->inp = NULL;
Pablo Galindocd8dcbc2021-03-14 04:38:40 +010058 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060061 tok->start = NULL;
62 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Guido van Rossum495da292019-03-07 12:38:08 -080084 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -070088 tok->interactive_underflow = IUNDERFLOW_NORMAL;
Pablo Galindo Salgado07cf66f2021-11-21 04:15:22 +000089 tok->str = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000091}
92
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000093static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070094new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095{
Victor Stinner00d7abd2020-12-01 09:56:42 +010096 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 memcpy(result, s, len);
102 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000104}
105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100111 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map an encoding name to its canonical spelling for the two encodings this
   file special-cases.

   Only the first 12 characters of S are examined.  They are compared
   case-insensitively (ASCII letters only) with '_' treated as '-'.

   Returns the literal "utf-8" for "utf-8" and "utf-8-*", the literal
   "iso-8859-1" for "latin-1", "iso-8859-1", "iso-latin-1" and their "-*"
   variants, and S itself (the same pointer) for anything else. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        char c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            /* Lower-case by hand: tolower() is locale-dependent and is
               undefined for negative char values. */
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
Pablo Galindo261a4522021-03-28 23:48:05 +0100166 if (memcmp(t, "coding", 6) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
Pablo Galindo261a4522021-03-28 23:48:05 +0100173 } while (t[0] == ' ' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100187 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200210 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 /* It's a continuation line, so it can't be a coding spec. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100212 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100215 if (!get_coding_spec(line, &cs, size, tok)) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700216 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100217 }
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100226 tok->decoding_state = STATE_NORMAL;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200227 break;
228 }
229 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200231 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100232 tok->decoding_state = STATE_NORMAL;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (tok->encoding == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100234 assert(tok->decoding_readline == NULL);
235 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236 error_ret(tok);
237 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238 PyMem_Free(cs);
239 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 } else { /* then, compare cs with BOM */
Pablo Galindo261a4522021-03-28 23:48:05 +0100243 if (strcmp(tok->encoding, cs) != 0) {
244 error_ret(tok);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 PyErr_Format(PyExc_SyntaxError,
246 "encoding problem: %s with BOM", cs);
Pablo Galindo261a4522021-03-28 23:48:05 +0100247 PyMem_Free(cs);
248 return 0;
249 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100250 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100252 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253}
254
255/* See whether the file starts with a BOM. If it does,
256 invoke the set_readline function with the new encoding.
257 Return 1 on success, 0 on failure. */
258
259static int
260check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 void unget_char(int, struct tok_state *),
262 int set_readline(struct tok_state *, const char *),
263 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 int ch1, ch2, ch3;
266 ch1 = get_char(tok);
Pablo Galindo261a4522021-03-28 23:48:05 +0100267 tok->decoding_state = STATE_SEEK_CODING;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (ch1 == EOF) {
269 return 1;
270 } else if (ch1 == 0xEF) {
271 ch2 = get_char(tok);
272 if (ch2 != 0xBB) {
273 unget_char(ch2, tok);
274 unget_char(ch1, tok);
275 return 1;
276 }
277 ch3 = get_char(tok);
278 if (ch3 != 0xBF) {
279 unget_char(ch3, tok);
280 unget_char(ch2, tok);
281 unget_char(ch1, tok);
282 return 1;
283 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000284#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 /* Disable support for UTF-16 BOMs until a decision
286 is made whether this needs to be supported. */
287 } else if (ch1 == 0xFE) {
288 ch2 = get_char(tok);
289 if (ch2 != 0xFF) {
290 unget_char(ch2, tok);
291 unget_char(ch1, tok);
292 return 1;
293 }
294 if (!set_readline(tok, "utf-16-be"))
295 return 0;
296 tok->decoding_state = STATE_NORMAL;
297 } else if (ch1 == 0xFF) {
298 ch2 = get_char(tok);
299 if (ch2 != 0xFE) {
300 unget_char(ch2, tok);
301 unget_char(ch1, tok);
302 return 1;
303 }
304 if (!set_readline(tok, "utf-16-le"))
305 return 0;
306 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 } else {
309 unget_char(ch1, tok);
310 return 1;
311 }
312 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100313 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700314 tok->encoding = new_string("utf-8", 5, tok);
315 if (!tok->encoding)
316 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 /* No need to set_readline: input is already utf-8 */
318 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319}
320
Pablo Galindo261a4522021-03-28 23:48:05 +0100321static int
322tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348}
349
350
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 stored the result in tok->decoding_buffer
Pablo Galindo261a4522021-03-28 23:48:05 +0100358 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
Pablo Galindo261a4522021-03-28 23:48:05 +0100361 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Pablo Galindo261a4522021-03-28 23:48:05 +0100363 reached): see tok_nextc and its calls to tok_reserve_buf.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
Pablo Galindo261a4522021-03-28 23:48:05 +0100366static int
367tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368{
Pablo Galindo261a4522021-03-28 23:48:05 +0100369 Py_ssize_t cur = tok->cur - tok->buf;
370 Py_ssize_t oldsize = tok->inp - tok->buf;
371 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372 if (newsize > tok->end - tok->buf) {
373 char *newbuf = tok->buf;
374 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700375 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
Pablo Galindo261a4522021-03-28 23:48:05 +0100377 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378 if (newbuf == NULL) {
379 tok->done = E_NOMEM;
380 return 0;
381 }
382 tok->buf = newbuf;
383 tok->cur = tok->buf + cur;
384 tok->inp = tok->buf + oldsize;
385 tok->end = tok->buf + newsize;
386 tok->start = start < 0 ? NULL : tok->buf + start;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700387 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
Pablo Galindo261a4522021-03-28 23:48:05 +0100389 }
390 return 1;
391}
392
393static int
394tok_readline_recode(struct tok_state *tok) {
395 PyObject *line;
396 const char *buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 Py_ssize_t buflen;
Pablo Galindo261a4522021-03-28 23:48:05 +0100398 line = tok->decoding_buffer;
399 if (line == NULL) {
400 line = PyObject_CallNoArgs(tok->decoding_readline);
401 if (line == NULL) {
402 error_ret(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 goto error;
404 }
405 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100406 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 tok->decoding_buffer = NULL;
Pablo Galindo261a4522021-03-28 23:48:05 +0100408 }
409 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410 if (buf == NULL) {
411 error_ret(tok);
412 goto error;
413 }
414 if (!tok_reserve_buf(tok, buflen + 1)) {
415 goto error;
416 }
417 memcpy(tok->inp, buf, buflen);
418 tok->inp += buflen;
419 *tok->inp = '\0';
420 if (tok->fp_interactive &&
421 tok_concatenate_interactive_new_line(tok, buf) == -1) {
422 goto error;
423 }
424 Py_DECREF(line);
425 return 1;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426error:
Pablo Galindo261a4522021-03-28 23:48:05 +0100427 Py_XDECREF(line);
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429}
430
431/* Set the readline function for TOK to a StreamReader's
432 readline function. The StreamReader is named ENC.
433
434 This function is called from check_bom and check_coding_spec.
435
436 ENC is usually identical to the future value of tok->encoding,
437 except for the (currently unsupported) case of UTF-16.
438
439 Return 1 on success, 0 on failure. */
440
441static int
442fp_setreadl(struct tok_state *tok, const char* enc)
443{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200445 _Py_IDENTIFIER(open);
446 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000447 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200448 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
Victor Stinner22a351a2010-10-14 12:04:34 +0000450 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200451 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100452 * position of tok->fp. If tok->fp was opened in text mode on Windows,
453 * its file position counts CRLF as one char and can't be directly mapped
454 * to the file offset for fd. Instead we step back one byte and read to
455 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200456 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100457 if (pos == -1 ||
458 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000459 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700460 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000461 }
462
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700463 io = PyImport_ImportModuleNoBlock("io");
464 if (io == NULL)
465 return 0;
466
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200467 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000468 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700469 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700471 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200473 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700474 Py_DECREF(stream);
475 if (readline == NULL)
476 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300477 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700478
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100479 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100480 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700481 if (bufobj == NULL)
482 return 0;
483 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100484 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700486 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487}
488
489/* Fetch the next byte from TOK. */
490
491static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493}
494
495/* Unfetch the last byte back into TOK. */
496
497static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499}
500
/* Check whether the bytes at S start a structurally valid UTF-8 sequence:
   a lead byte followed by the right number of continuation bytes
   (0x80..0xBF).  Return the number of bytes forming the sequence if yes,
   0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a NUL inside a truncated sequence stops the scan and nothing
   past the terminator is read. */
static int valid_utf8(const unsigned char* s)
{
    int length;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        length = 2;
    else if (*s < 0xF0)
        length = 3;
    else if (*s < 0xF8)
        length = 4;
    else
        return 0;
    for (i = 1; i < length; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return length;
}
528
Pablo Galindo261a4522021-03-28 23:48:05 +0100529static int
530ensure_utf8(char *line, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 int badchar = 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000538 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000539 }
540 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
Pablo Galindo261a4522021-03-28 23:48:05 +0100543 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200544 PyErr_Format(PyExc_SyntaxError,
Pablo Galindo261a4522021-03-28 23:48:05 +0100545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
Miss Islington (bot)f7f1c262021-07-30 07:25:28 -0700548 "see https://python.org/dev/peps/pep-0263/ for details",
Pablo Galindo261a4522021-03-28 23:48:05 +0100549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100552 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Fetch a byte from TOK, using the string buffer. */
556
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000557static int
558buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562/* Unfetch a byte from TOK, using the string buffer. */
563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000564static void
565buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 tok->str--;
567 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568}
569
570/* Set the readline function for TOK to ENC. For the string-based
571 tokenizer, this means to just record the encoding. */
572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000573static int
574buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 tok->enc = enc;
576 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577}
578
579/* Return a UTF-8 encoding Python string object from the
580 C byte string STR, which is encoded with ENC. */
581
582static PyObject *
583translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000593
594static char *
595translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 char *buf, *current;
599 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100600 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000629 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100631 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000632 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100633 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000634 }
635 buf = result;
636 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000638}
639
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640/* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
Andy Lester384f3c52020-02-27 20:44:52 -0600644static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000645decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600648 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 if (tok->enc != NULL) {
662 utf8 = translate_into_utf8(str, tok->enc);
663 if (utf8 == NULL)
664 return error_ret(tok);
665 str = PyBytes_AsString(utf8);
666 }
667 for (s = str;; s++) {
668 if (*s == '\0') break;
669 else if (*s == '\n') {
670 assert(lineno < 2);
671 newl[lineno] = s;
672 lineno++;
673 if (lineno == 2) break;
674 }
675 }
676 tok->enc = NULL;
677 /* need to check line 1 and 2 separately since check_coding_spec
678 assumes a single line as input */
679 if (newl[0]) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100680 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681 return NULL;
682 }
683 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685 tok, buf_setreadl))
Pablo Galindo261a4522021-03-28 23:48:05 +0100686 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 }
688 }
689 if (tok->enc != NULL) {
690 assert(utf8 == NULL);
691 utf8 = translate_into_utf8(str, tok->enc);
692 if (utf8 == NULL)
693 return error_ret(tok);
694 str = PyBytes_AS_STRING(utf8);
695 }
696 assert(tok->decoding_buffer == NULL);
697 tok->decoding_buffer = utf8; /* CAUTION */
698 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699}
700
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701/* Set up tokenizer for string */
702
703struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600707 char *decoded;
708
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (tok == NULL)
710 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyTokenizer_Free(tok);
714 return NULL;
715 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716
Andy Lester384f3c52020-02-27 20:44:52 -0600717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720}
721
Pablo Galindo261a4522021-03-28 23:48:05 +0100722/* Set up tokenizer for UTF-8 string */
723
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000724struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000725PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600728 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 if (tok == NULL)
730 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyTokenizer_Free(tok);
734 return NULL;
735 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100736 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600738 tok->str = translated;
Pablo Galindo261a4522021-03-28 23:48:05 +0100739 tok->encoding = new_string("utf-8", 5, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744
Andy Lester384f3c52020-02-27 20:44:52 -0600745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748}
749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300753PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100771 tok->encoding = new_string(enc, strlen(enc), tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779}
780
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781/* Free a tok_state structure */
782
783void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000784PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785{
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100786 if (tok->encoding != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100787 PyMem_Free(tok->encoding);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100788 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200791 Py_XDECREF(tok->filename);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100792 if (tok->fp != NULL && tok->buf != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100793 PyMem_Free(tok->buf);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100794 }
795 if (tok->input) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100796 PyMem_Free(tok->input);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100801 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
Pablo Galindo261a4522021-03-28 23:48:05 +0100804static int
805tok_readline_raw(struct tok_state *tok)
806{
807 do {
808 if (!tok_reserve_buf(tok, BUFSIZ)) {
809 return 0;
810 }
811 char *line = Py_UniversalNewlineFgets(tok->inp,
812 (int)(tok->end - tok->inp),
813 tok->fp, NULL);
814 if (line == NULL) {
815 return 1;
816 }
817 if (tok->fp_interactive &&
818 tok_concatenate_interactive_new_line(tok, line) == -1) {
819 return 0;
820 }
Miss Islington (bot)94483f12021-12-12 08:52:49 -0800821 tok->inp = strchr(tok->inp, '\0');
822 if (tok->inp == tok->buf) {
Pablo Galindo92a02c12021-03-30 00:24:49 +0100823 return 0;
824 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100825 } while (tok->inp[-1] != '\n');
826 return 1;
827}
828
829static int
830tok_underflow_string(struct tok_state *tok) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL) {
833 end++;
834 }
835 else {
836 end = strchr(tok->inp, '\0');
837 if (end == tok->inp) {
838 tok->done = E_EOF;
839 return 0;
840 }
841 }
842 if (tok->start == NULL) {
843 tok->buf = tok->cur;
844 }
845 tok->line_start = tok->cur;
846 tok->lineno++;
847 tok->inp = end;
848 return 1;
849}
850
/* Refill the tokenizer buffer with one line read via PyOS_Readline().
   Returns 1 when a new line was buffered (or when the interactive
   underflow mode is IUNDERFLOW_STOP, in which case tok->done is set to
   E_INTERACT_STOP); returns 0 on end of input, interrupt or error, with
   tok->done describing the cause where this function sets it. */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Replace the raw line with its newline-translated copy. */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        /* Copy the bytes object's contents into a PyMem buffer so that
           newtok is again a plain PyMem-allocated C string. */
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    /* Note: newtok may still be NULL here (PyOS_Readline() returned NULL). */
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    /* After the first read, switch to the continuation prompt if any. */
    if (tok->nextprompt != NULL) {
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* An empty line from the reader is treated as end of input. */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the existing
           buffer.  multi_line_start is saved as an offset and rebuilt from
           tok->buf afterwards -- presumably because tok_reserve_buf() may
           move the buffer (TODO confirm against tok_reserve_buf). */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        /* size + 1 so the terminating NUL is copied as well. */
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line simply replaces the old
           buffer, and the tokenizer takes ownership of newtok. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
934
935static int
936tok_underflow_file(struct tok_state *tok) {
937 if (tok->start == NULL) {
938 tok->cur = tok->inp = tok->buf;
939 }
940 if (tok->decoding_state == STATE_INIT) {
941 /* We have not yet determined the encoding.
942 If an encoding is found, use the file-pointer
943 reader functions from now on. */
944 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945 error_ret(tok);
946 return 0;
947 }
948 assert(tok->decoding_state != STATE_INIT);
949 }
950 /* Read until '\n' or EOF */
951 if (tok->decoding_readline != NULL) {
952 /* We already have a codec associated with this input. */
953 if (!tok_readline_recode(tok)) {
954 return 0;
955 }
956 }
957 else {
958 /* We want a 'raw' read. */
959 if (!tok_readline_raw(tok)) {
960 return 0;
961 }
962 }
963 if (tok->inp == tok->cur) {
964 tok->done = E_EOF;
965 return 0;
966 }
967 if (tok->inp[-1] != '\n') {
968 /* Last line does not end in \n, fake one */
969 *tok->inp++ = '\n';
970 *tok->inp = '\0';
971 }
972
973 tok->lineno++;
974 if (tok->decoding_state != STATE_NORMAL) {
975 if (tok->lineno > 2) {
976 tok->decoding_state = STATE_NORMAL;
977 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100978 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Pablo Galindo261a4522021-03-28 23:48:05 +0100979 tok, fp_setreadl))
980 {
981 return 0;
982 }
983 }
984 /* The default encoding is UTF-8, so make sure we don't have any
985 non-UTF-8 sequences in it. */
Miss Islington (bot)94483f12021-12-12 08:52:49 -0800986 if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
987 error_ret(tok);
988 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100989 }
990 assert(tok->done == E_OK);
991 return tok->done == E_OK;
992}
993
#if defined(Py_DEBUG)
/* Debug helper: write the first `size` bytes of `s` to `f` as a
   double-quoted string with C-style escapes, or the word NULL when `s`
   is NULL.

   Common control characters, both quote characters and the backslash get
   a symbolic escape; every other byte outside the printable ASCII range
   0x20..0x7e (this includes DEL, 0x7f) is written as \xNN.  Escaping the
   backslash keeps the output unambiguous: a literal backslash followed by
   'n' cannot be mistaken for an escaped newline. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, not a printable one. */
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001022
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001023/* Get next char, updating state; error code goes into tok->done */
1024
1025static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001026tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001027{
Pablo Galindo261a4522021-03-28 23:48:05 +01001028 int rc;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 for (;;) {
1030 if (tok->cur != tok->inp) {
1031 return Py_CHARMASK(*tok->cur++); /* Fast path */
1032 }
Pablo Galindo Salgado5b58db72022-02-08 12:25:15 +00001033 if (tok->done != E_OK) {
1034 return EOF;
1035 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001036 if (tok->fp == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +01001037 rc = tok_underflow_string(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001039 else if (tok->prompt != NULL) {
1040 rc = tok_underflow_interactive(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 }
1042 else {
Pablo Galindo261a4522021-03-28 23:48:05 +01001043 rc = tok_underflow_file(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001044 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001045#if defined(Py_DEBUG)
Pablo Galindo261a4522021-03-28 23:48:05 +01001046 if (Py_DebugFlag) {
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001047 fprintf(stderr, "line[%d] = ", tok->lineno);
Miss Islington (bot)d8ca47c2021-10-29 10:21:15 -07001048 print_escape(stderr, tok->cur, tok->inp - tok->cur);
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001049 fprintf(stderr, " tok->done = %d\n", tok->done);
Pablo Galindo261a4522021-03-28 23:48:05 +01001050 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001051#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001052 if (!rc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053 tok->cur = tok->inp;
1054 return EOF;
1055 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001056 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001058 Py_UNREACHABLE();
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001059}
1060
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061/* Back-up one character */
1062
1063static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001064tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001065{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001066 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001067 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001068 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001069 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001070 if ((int)(unsigned char)*tok->cur != c) {
1071 Py_FatalError("tok_backup: wrong character");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001072 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074}
1075
Guido van Rossum926f13a1998-04-09 21:38:06 +00001076static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001077_syntaxerror_range(struct tok_state *tok, const char *format,
1078 int col_offset, int end_col_offset,
1079 va_list vargs)
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001080{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001081 PyObject *errmsg, *errtext, *args;
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001082 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001083 if (!errmsg) {
1084 goto error;
1085 }
1086
1087 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1088 "replace");
1089 if (!errtext) {
1090 goto error;
1091 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001092
1093 if (col_offset == -1) {
1094 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1095 }
1096 if (end_col_offset == -1) {
1097 end_col_offset = col_offset;
1098 }
1099
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001100 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1101 if (line_len != tok->cur - tok->line_start) {
1102 Py_DECREF(errtext);
1103 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1104 "replace");
1105 }
1106 if (!errtext) {
1107 goto error;
1108 }
1109
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001110 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1111 col_offset, errtext, tok->lineno, end_col_offset);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001112 if (args) {
1113 PyErr_SetObject(PyExc_SyntaxError, args);
1114 Py_DECREF(args);
1115 }
1116
1117error:
1118 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001119 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001120 return ERRORTOKEN;
1121}
1122
1123static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001124syntaxerror(struct tok_state *tok, const char *format, ...)
1125{
1126 va_list vargs;
1127#ifdef HAVE_STDARG_PROTOTYPES
1128 va_start(vargs, format);
1129#else
1130 va_start(vargs);
1131#endif
1132 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1133 va_end(vargs);
1134 return ret;
1135}
1136
1137static int
1138syntaxerror_known_range(struct tok_state *tok,
1139 int col_offset, int end_col_offset,
1140 const char *format, ...)
1141{
1142 va_list vargs;
1143#ifdef HAVE_STDARG_PROTOTYPES
1144 va_start(vargs, format);
1145#else
1146 va_start(vargs);
1147#endif
1148 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1149 va_end(vargs);
1150 return ret;
1151}
1152
1153
1154
1155static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001156indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001157{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001158 tok->done = E_TABSPACE;
1159 tok->cur = tok->inp;
1160 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001161}
1162
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001163static int
1164parser_warn(struct tok_state *tok, const char *format, ...)
1165{
1166 PyObject *errmsg;
1167 va_list vargs;
1168#ifdef HAVE_STDARG_PROTOTYPES
1169 va_start(vargs, format);
1170#else
1171 va_start(vargs);
1172#endif
1173 errmsg = PyUnicode_FromFormatV(format, vargs);
1174 va_end(vargs);
1175 if (!errmsg) {
1176 goto error;
1177 }
1178
1179 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1180 tok->lineno, NULL, NULL) < 0) {
1181 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1182 /* Replace the DeprecationWarning exception with a SyntaxError
1183 to get a more accurate error report */
1184 PyErr_Clear();
1185 syntaxerror(tok, "%U", errmsg);
1186 }
1187 goto error;
1188 }
1189 Py_DECREF(errmsg);
1190 return 0;
1191
1192error:
1193 Py_XDECREF(errmsg);
1194 tok->done = E_ERROR;
1195 return -1;
1196}
1197
/* Check whether the upcoming input is exactly `test` followed by a
   character that cannot continue an identifier.  Every character read is
   pushed back, so the tokenizer position is unchanged on return.
   Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *pos = test;

    /* Consume input for as long as it keeps matching `test`. */
    int c = tok_nextc(tok);
    while (*pos != 0 && c == *pos) {
        pos++;
        c = tok_nextc(tok);
    }

    int matched = 0;
    if (*pos == 0) {
        /* All of `test` matched; it must not run on into an identifier. */
        matched = !is_potential_identifier_char(c);
    }

    /* Undo the reads in reverse order. */
    tok_backup(tok, c);
    while (pos != test) {
        tok_backup(tok, *--pos);
    }
    return matched;
}
1220
/* Check the character `c` that immediately follows a numeric literal of
   the given `kind` (used in the "invalid %s literal" message).
   Returns 1 when tokenizing may continue, 0 when an error was raised. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of the keywords which can occur after a numeric
     * literal in valid code: "and", "else", "for", "if", "in", "is", "not"
     * and "or".  (For 'i', only the next character is inspected.)
     * This allows existing valid code to be deprecated gradually without
     * adding a warning before the error in most cases of an invalid numeric
     * literal (which would be confusing and break existing tests).
     * Raise a syntax error with a slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * another keyword or identifier.
     */
    int is_keyword = 0;
    switch (c) {
    case 'a':
        is_keyword = lookahead(tok, "nd");
        break;
    case 'e':
        is_keyword = lookahead(tok, "lse");
        break;
    case 'f':
        is_keyword = lookahead(tok, "or");
        break;
    case 'i': {
        int next = tok_nextc(tok);
        if (next == 'f' || next == 'n' || next == 's') {
            is_keyword = 1;
        }
        tok_backup(tok, next);
        break;
    }
    case 'o':
        is_keyword = lookahead(tok, "r");
        break;
    case 'n':
        is_keyword = lookahead(tok, "ot");
        break;
    default:
        break;
    }

    if (is_keyword) {
        /* Warn with `c` pushed back, then read it again. */
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
        return 1;
    }

    /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273/* Verify that the identifier follows PEP 3131.
1274 All identifier strings are guaranteed to be "ready" unicode objects.
1275 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001276static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001277verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001278{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001280 if (tok->decoding_erred)
1281 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001283 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001285 tok->done = E_DECODE;
1286 }
1287 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 tok->done = E_ERROR;
1289 }
1290 return 0;
1291 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001292 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1293 if (invalid < 0) {
1294 Py_DECREF(s);
1295 tok->done = E_ERROR;
1296 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001297 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001298 assert(PyUnicode_GET_LENGTH(s) > 0);
1299 if (invalid < PyUnicode_GET_LENGTH(s)) {
1300 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1301 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1302 /* Determine the offset in UTF-8 encoded input */
1303 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1304 if (s != NULL) {
1305 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1306 }
1307 if (s == NULL) {
1308 tok->done = E_ERROR;
1309 return 0;
1310 }
1311 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1312 }
1313 Py_DECREF(s);
1314 // PyUnicode_FromFormatV() does not support %X
1315 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001316 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001317 if (Py_UNICODE_ISPRINTABLE(ch)) {
1318 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1319 }
1320 else {
1321 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1322 }
1323 return 0;
1324 }
1325 Py_DECREF(s);
1326 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001327}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001328
/* Consume the rest of a run of decimal digits in which single underscores
   may separate digit groups.  Returns the first character after the run
   (already consumed), or 0 after raising "invalid decimal literal" when an
   underscore is not followed by a digit. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1350
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351/* Get next token, after space stripping etc. */
1352
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001353static inline int
1354tok_continuation_line(struct tok_state *tok) {
1355 int c = tok_nextc(tok);
1356 if (c != '\n') {
1357 tok->done = E_LINECONT;
1358 return -1;
1359 }
1360 c = tok_nextc(tok);
1361 if (c == EOF) {
1362 tok->done = E_EOF;
1363 tok->cur = tok->inp;
1364 return -1;
1365 } else {
1366 tok_backup(tok, c);
1367 }
1368 return c;
1369}
1370
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001371static int
Andy Lester384f3c52020-02-27 20:44:52 -06001372tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001373{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001374 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001376
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001378 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 tok->start = NULL;
1380 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 /* Get indentation level */
1383 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001384 int col = 0;
1385 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386 tok->atbol = 0;
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001387 int cont_line_col = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 for (;;) {
1389 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001390 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001392 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001394 col = (col / tok->tabsize + 1) * tok->tabsize;
1395 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 }
Brett Cannona721aba2016-09-09 14:57:09 -07001397 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001398 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001399 }
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001400 else if (c == '\\') {
1401 // Indentation cannot be split over multiple physical lines
1402 // using backslashes. This means that if we found a backslash
1403 // preceded by whitespace, **the first one we find** determines
1404 // the level of indentation of whatever comes next.
1405 cont_line_col = cont_line_col ? cont_line_col : col;
1406 if ((c = tok_continuation_line(tok)) == -1) {
1407 return ERRORTOKEN;
1408 }
1409 }
Brett Cannona721aba2016-09-09 14:57:09 -07001410 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001412 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 }
1414 tok_backup(tok, c);
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001415 if (c == '#' || c == '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 /* Lines with only whitespace and/or comments
1417 shouldn't affect the indentation and are
1418 not passed to the parser as NEWLINE tokens,
1419 except *totally* empty lines in interactive
1420 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001421 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001423 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001424 else if (tok->prompt != NULL && tok->lineno == 1) {
1425 /* In interactive mode, if the first line contains
1426 only spaces and/or a comment, let it through. */
1427 blankline = 0;
1428 col = altcol = 0;
1429 }
Brett Cannona721aba2016-09-09 14:57:09 -07001430 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001432 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 /* We can't jump back right here since we still
1434 may need to skip to the end of a comment */
1435 }
1436 if (!blankline && tok->level == 0) {
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00001437 col = cont_line_col ? cont_line_col : col;
1438 altcol = cont_line_col ? cont_line_col : altcol;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 if (col == tok->indstack[tok->indent]) {
1440 /* No change */
1441 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001442 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 }
1444 }
1445 else if (col > tok->indstack[tok->indent]) {
1446 /* Indent -- always one */
1447 if (tok->indent+1 >= MAXINDENT) {
1448 tok->done = E_TOODEEP;
1449 tok->cur = tok->inp;
1450 return ERRORTOKEN;
1451 }
1452 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001453 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 }
1455 tok->pendin++;
1456 tok->indstack[++tok->indent] = col;
1457 tok->altindstack[tok->indent] = altcol;
1458 }
1459 else /* col < tok->indstack[tok->indent] */ {
1460 /* Dedent -- any number, must be consistent */
1461 while (tok->indent > 0 &&
1462 col < tok->indstack[tok->indent]) {
1463 tok->pendin--;
1464 tok->indent--;
1465 }
1466 if (col != tok->indstack[tok->indent]) {
1467 tok->done = E_DEDENT;
1468 tok->cur = tok->inp;
1469 return ERRORTOKEN;
1470 }
1471 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001472 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 }
1474 }
1475 }
1476 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001477
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 /* Return pending indents/dedents */
1481 if (tok->pendin != 0) {
1482 if (tok->pendin < 0) {
1483 tok->pendin++;
1484 return DEDENT;
1485 }
1486 else {
1487 tok->pendin--;
1488 return INDENT;
1489 }
1490 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001491
Guido van Rossum495da292019-03-07 12:38:08 -08001492 /* Peek ahead at the next character */
1493 c = tok_nextc(tok);
1494 tok_backup(tok, c);
1495 /* Check if we are closing an async function */
1496 if (tok->async_def
1497 && !blankline
1498 /* Due to some implementation artifacts of type comments,
1499 * a TYPE_COMMENT at the start of a function won't set an
1500 * indentation level and it will produce a NEWLINE after it.
1501 * To avoid spuriously ending an async function due to this,
1502 * wait until we have some non-newline char in front of us. */
1503 && c != '\n'
1504 && tok->level == 0
1505 /* There was a NEWLINE after ASYNC DEF,
1506 so we're past the signature. */
1507 && tok->async_def_nl
1508 /* Current indentation level is less than where
1509 the async function was defined */
1510 && tok->async_def_indent >= tok->indent)
1511 {
1512 tok->async_def = 0;
1513 tok->async_def_indent = 0;
1514 tok->async_def_nl = 0;
1515 }
1516
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001517 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001518 tok->start = NULL;
1519 /* Skip spaces */
1520 do {
1521 c = tok_nextc(tok);
1522 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001523
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 /* Set start of current token */
1525 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001526
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001527 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001528 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001529 const char *prefix, *p, *type_start;
1530
Brett Cannona721aba2016-09-09 14:57:09 -07001531 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001533 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001534
1535 if (tok->type_comments) {
1536 p = tok->start;
1537 prefix = type_comment_prefix;
1538 while (*prefix && p < tok->cur) {
1539 if (*prefix == ' ') {
1540 while (*p == ' ' || *p == '\t') {
1541 p++;
1542 }
1543 } else if (*prefix == *p) {
1544 p++;
1545 } else {
1546 break;
1547 }
1548
1549 prefix++;
1550 }
1551
1552 /* This is a type comment if we matched all of type_comment_prefix. */
1553 if (!*prefix) {
1554 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001555 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001556 tok_backup(tok, c); /* don't eat the newline or EOF */
1557
1558 type_start = p;
1559
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001560 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001561 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001562 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001563 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001564 && !(tok->cur > ignore_end
1565 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001566
1567 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001568 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001569 *p_end = tok->cur;
1570
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001571 /* If this type ignore is the only thing on the line, consume the newline also. */
1572 if (blankline) {
1573 tok_nextc(tok);
1574 tok->atbol = 1;
1575 }
1576 return TYPE_IGNORE;
1577 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001578 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001579 *p_end = tok->cur;
1580 return TYPE_COMMENT;
1581 }
1582 }
1583 }
Brett Cannona721aba2016-09-09 14:57:09 -07001584 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001585
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -07001586 if (tok->done == E_INTERACT_STOP) {
1587 return ENDMARKER;
1588 }
1589
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 /* Check for EOF and errors now */
1591 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001592 if (tok->level) {
1593 return ERRORTOKEN;
1594 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1596 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001597
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 /* Identifier (most frequent token!) */
1599 nonascii = 0;
1600 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001601 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001602 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001603 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001604 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001605 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001606 /* Since this is a backwards compatibility support literal we don't
1607 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001608 else if (!(saw_b || saw_u || saw_r || saw_f)
1609 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001610 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001611 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001612 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001613 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001614 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001615 }
1616 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001617 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001618 }
1619 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001620 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001621 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001623 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001625 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 }
1627 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001628 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001630 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 c = tok_nextc(tok);
1632 }
1633 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001634 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001636 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001637
1638 *p_start = tok->start;
1639 *p_end = tok->cur;
1640
Guido van Rossum495da292019-03-07 12:38:08 -08001641 /* async/await parsing block. */
1642 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1643 /* May be an 'async' or 'await' token. For Python 3.7 or
1644 later we recognize them unconditionally. For Python
1645 3.5 or 3.6 we recognize 'async' in front of 'def', and
1646 either one inside of 'async def'. (Technically we
1647 shouldn't recognize these at all for 3.4 or earlier,
1648 but there's no *valid* Python 3.4 code that would be
1649 rejected, and async functions will be rejected in a
1650 later phase.) */
1651 if (!tok->async_hacks || tok->async_def) {
1652 /* Always recognize the keywords. */
1653 if (memcmp(tok->start, "async", 5) == 0) {
1654 return ASYNC;
1655 }
1656 if (memcmp(tok->start, "await", 5) == 0) {
1657 return AWAIT;
1658 }
1659 }
1660 else if (memcmp(tok->start, "async", 5) == 0) {
1661 /* The current token is 'async'.
1662 Look ahead one token to see if that is 'def'. */
1663
1664 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001665 const char *ahead_tok_start = NULL;
1666 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001667 int ahead_tok_kind;
1668
1669 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1670 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1671 &ahead_tok_end);
1672
1673 if (ahead_tok_kind == NAME
1674 && ahead_tok.cur - ahead_tok.start == 3
1675 && memcmp(ahead_tok.start, "def", 3) == 0)
1676 {
1677 /* The next token is going to be 'def', so instead of
1678 returning a plain NAME token, return ASYNC. */
1679 tok->async_def_indent = tok->indent;
1680 tok->async_def = 1;
1681 return ASYNC;
1682 }
1683 }
1684 }
1685
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 return NAME;
1687 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001688
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 /* Newline */
1690 if (c == '\n') {
1691 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001692 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001694 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 *p_start = tok->start;
1696 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1697 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001698 if (tok->async_def) {
1699 /* We're somewhere inside an 'async def' function, and
1700 we've encountered a NEWLINE after its signature. */
1701 tok->async_def_nl = 1;
1702 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001703 return NEWLINE;
1704 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001705
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 /* Period or number starting with period? */
1707 if (c == '.') {
1708 c = tok_nextc(tok);
1709 if (isdigit(c)) {
1710 goto fraction;
1711 } else if (c == '.') {
1712 c = tok_nextc(tok);
1713 if (c == '.') {
1714 *p_start = tok->start;
1715 *p_end = tok->cur;
1716 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001717 }
1718 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 tok_backup(tok, c);
1720 }
1721 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001722 }
1723 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 tok_backup(tok, c);
1725 }
1726 *p_start = tok->start;
1727 *p_end = tok->cur;
1728 return DOT;
1729 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001730
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001731 /* Number */
1732 if (isdigit(c)) {
1733 if (c == '0') {
1734 /* Hex, octal or binary -- maybe. */
1735 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 /* Hex */
1738 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001739 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001740 if (c == '_') {
1741 c = tok_nextc(tok);
1742 }
1743 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001744 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001745 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001746 }
1747 do {
1748 c = tok_nextc(tok);
1749 } while (isxdigit(c));
1750 } while (c == '_');
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001751 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1752 return ERRORTOKEN;
1753 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 }
1755 else if (c == 'o' || c == 'O') {
1756 /* Octal */
1757 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001759 if (c == '_') {
1760 c = tok_nextc(tok);
1761 }
1762 if (c < '0' || c >= '8') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001763 if (isdigit(c)) {
1764 return syntaxerror(tok,
1765 "invalid digit '%c' in octal literal", c);
1766 }
1767 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001768 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001769 return syntaxerror(tok, "invalid octal literal");
1770 }
Brett Cannona721aba2016-09-09 14:57:09 -07001771 }
1772 do {
1773 c = tok_nextc(tok);
1774 } while ('0' <= c && c < '8');
1775 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001776 if (isdigit(c)) {
1777 return syntaxerror(tok,
1778 "invalid digit '%c' in octal literal", c);
1779 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001780 if (!verify_end_of_number(tok, c, "octal")) {
1781 return ERRORTOKEN;
1782 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 }
1784 else if (c == 'b' || c == 'B') {
1785 /* Binary */
1786 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001787 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001788 if (c == '_') {
1789 c = tok_nextc(tok);
1790 }
1791 if (c != '0' && c != '1') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001792 if (isdigit(c)) {
1793 return syntaxerror(tok,
1794 "invalid digit '%c' in binary literal", c);
1795 }
1796 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001797 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001798 return syntaxerror(tok, "invalid binary literal");
1799 }
Brett Cannona721aba2016-09-09 14:57:09 -07001800 }
1801 do {
1802 c = tok_nextc(tok);
1803 } while (c == '0' || c == '1');
1804 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001805 if (isdigit(c)) {
1806 return syntaxerror(tok,
1807 "invalid digit '%c' in binary literal", c);
1808 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001809 if (!verify_end_of_number(tok, c, "binary")) {
1810 return ERRORTOKEN;
1811 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001812 }
1813 else {
1814 int nonzero = 0;
1815 /* maybe old-style octal; c is first char of it */
1816 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001817 while (1) {
1818 if (c == '_') {
1819 c = tok_nextc(tok);
1820 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001821 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001822 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001823 }
1824 }
1825 if (c != '0') {
1826 break;
1827 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 c = tok_nextc(tok);
1829 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001830 char* zeros_end = tok->cur;
Brett Cannona721aba2016-09-09 14:57:09 -07001831 if (isdigit(c)) {
1832 nonzero = 1;
1833 c = tok_decimal_tail(tok);
1834 if (c == 0) {
1835 return ERRORTOKEN;
1836 }
1837 }
1838 if (c == '.') {
1839 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001840 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001841 }
1842 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001843 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001844 }
1845 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001846 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001847 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001848 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001849 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 tok_backup(tok, c);
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001851 return syntaxerror_known_range(
1852 tok, (int)(tok->start + 1 - tok->line_start),
1853 (int)(zeros_end - tok->line_start),
1854 "leading zeros in decimal integer "
1855 "literals are not permitted; "
1856 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001857 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001858 if (!verify_end_of_number(tok, c, "decimal")) {
1859 return ERRORTOKEN;
1860 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001861 }
1862 }
1863 else {
1864 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001865 c = tok_decimal_tail(tok);
1866 if (c == 0) {
1867 return ERRORTOKEN;
1868 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001869 {
1870 /* Accept floating point numbers. */
1871 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001872 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001873 fraction:
1874 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001875 if (isdigit(c)) {
1876 c = tok_decimal_tail(tok);
1877 if (c == 0) {
1878 return ERRORTOKEN;
1879 }
1880 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001881 }
1882 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001883 int e;
1884 exponent:
1885 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001886 /* Exponent part */
1887 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001888 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001889 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001890 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001891 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001892 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001893 }
1894 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001895 tok_backup(tok, c);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001896 if (!verify_end_of_number(tok, e, "decimal")) {
1897 return ERRORTOKEN;
1898 }
Benjamin Petersonc4161622014-06-07 12:36:39 -07001899 tok_backup(tok, e);
1900 *p_start = tok->start;
1901 *p_end = tok->cur;
1902 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001903 }
Brett Cannona721aba2016-09-09 14:57:09 -07001904 c = tok_decimal_tail(tok);
1905 if (c == 0) {
1906 return ERRORTOKEN;
1907 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001908 }
Brett Cannona721aba2016-09-09 14:57:09 -07001909 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001910 /* Imaginary part */
1911 imaginary:
1912 c = tok_nextc(tok);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001913 if (!verify_end_of_number(tok, c, "imaginary")) {
1914 return ERRORTOKEN;
1915 }
1916 }
1917 else if (!verify_end_of_number(tok, c, "decimal")) {
1918 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001919 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001920 }
1921 }
1922 tok_backup(tok, c);
1923 *p_start = tok->start;
1924 *p_end = tok->cur;
1925 return NUMBER;
1926 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001927
1928 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001929 /* String */
1930 if (c == '\'' || c == '"') {
1931 int quote = c;
1932 int quote_size = 1; /* 1 or 3 */
1933 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001934
Anthony Sottile995d9b92019-01-12 20:05:13 -08001935 /* Nodes of type STRING, especially multi line strings
1936 must be handled differently in order to get both
1937 the starting line number and the column offset right.
1938 (cf. issue 16806) */
1939 tok->first_lineno = tok->lineno;
1940 tok->multi_line_start = tok->line_start;
1941
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001942 /* Find the quote size and start of string */
1943 c = tok_nextc(tok);
1944 if (c == quote) {
1945 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001946 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001947 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001948 }
1949 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001950 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001951 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001952 }
Brett Cannona721aba2016-09-09 14:57:09 -07001953 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001954 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001955 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001956
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001957 /* Get rest of string */
1958 while (end_quote_size != quote_size) {
1959 c = tok_nextc(tok);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001960 if (c == EOF || (quote_size == 1 && c == '\n')) {
Miss Islington (bot)d03f3422021-06-12 13:27:02 -07001961 assert(tok->multi_line_start != NULL);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001962 // shift the tok_state's location into
1963 // the start of string, and report the error
1964 // from the initial quote character
1965 tok->cur = (char *)tok->start;
1966 tok->cur++;
1967 tok->line_start = tok->multi_line_start;
1968 int start = tok->lineno;
1969 tok->lineno = tok->first_lineno;
Brett Cannona721aba2016-09-09 14:57:09 -07001970 if (quote_size == 3) {
Pablo Galindo Salgado5b58db72022-02-08 12:25:15 +00001971 syntaxerror(tok, "unterminated triple-quoted string literal"
1972 " (detected at line %d)", start);
1973 if (c != '\n') {
1974 tok->done = E_EOFS;
1975 }
1976 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001977 }
1978 else {
Pablo Galindo Salgado5b58db72022-02-08 12:25:15 +00001979 syntaxerror(tok, "unterminated string literal (detected at"
1980 " line %d)", start);
1981 if (c != '\n') {
1982 tok->done = E_EOLS;
1983 }
1984 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001985 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001986 }
Brett Cannona721aba2016-09-09 14:57:09 -07001987 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001988 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001989 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001990 else {
1991 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001992 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001993 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001994 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001995 }
1996 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001997
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001998 *p_start = tok->start;
1999 *p_end = tok->cur;
2000 return STRING;
2001 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002003 /* Line continuation */
2004 if (c == '\\') {
Pablo Galindo Salgado3fc8b742022-01-25 22:33:57 +00002005 if ((c = tok_continuation_line(tok)) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002006 return ERRORTOKEN;
2007 }
2008 tok->cont_line = 1;
2009 goto again; /* Read next line */
2010 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002011
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002012 /* Check for two-character token */
2013 {
2014 int c2 = tok_nextc(tok);
2015 int token = PyToken_TwoChars(c, c2);
2016 if (token != OP) {
2017 int c3 = tok_nextc(tok);
2018 int token3 = PyToken_ThreeChars(c, c2, c3);
2019 if (token3 != OP) {
2020 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07002021 }
2022 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002023 tok_backup(tok, c3);
2024 }
2025 *p_start = tok->start;
2026 *p_end = tok->cur;
2027 return token;
2028 }
2029 tok_backup(tok, c2);
2030 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002031
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002032 /* Keep track of parentheses nesting level */
2033 switch (c) {
2034 case '(':
2035 case '[':
2036 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002037 if (tok->level >= MAXLEVEL) {
2038 return syntaxerror(tok, "too many nested parentheses");
2039 }
2040 tok->parenstack[tok->level] = c;
2041 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindoae7d3cd92021-01-20 12:53:52 +00002042 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002043 tok->level++;
2044 break;
2045 case ')':
2046 case ']':
2047 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002048 if (!tok->level) {
2049 return syntaxerror(tok, "unmatched '%c'", c);
2050 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002051 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002052 int opening = tok->parenstack[tok->level];
2053 if (!((opening == '(' && c == ')') ||
2054 (opening == '[' && c == ']') ||
2055 (opening == '{' && c == '}')))
2056 {
2057 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2058 return syntaxerror(tok,
2059 "closing parenthesis '%c' does not match "
2060 "opening parenthesis '%c' on line %d",
2061 c, opening, tok->parenlinenostack[tok->level]);
2062 }
2063 else {
2064 return syntaxerror(tok,
2065 "closing parenthesis '%c' does not match "
2066 "opening parenthesis '%c'",
2067 c, opening);
2068 }
2069 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002070 break;
2071 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002072
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002073 /* Punctuation character */
2074 *p_start = tok->start;
2075 *p_end = tok->cur;
2076 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002077}
2078
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002079int
Andy Lester384f3c52020-02-27 20:44:52 -06002080PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002081{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002082 int result = tok_get(tok, p_start, p_end);
2083 if (tok->decoding_erred) {
2084 result = ERRORTOKEN;
2085 tok->done = E_DECODE;
2086 }
2087 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002088}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002089
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002090/* Get the encoding of a Python file. Check for the coding cookie and check if
2091 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002092
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002093 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2094 encoding in the first or second line of the file (in which case the encoding
2095 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00002096
Victor Stinner00d7abd2020-12-01 09:56:42 +01002097 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002098 by the caller. */
2099
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002100char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002101PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00002102{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002103 struct tok_state *tok;
2104 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06002105 const char *p_start = NULL;
2106 const char *p_end = NULL;
2107 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002108
Victor Stinnerdaf45552013-08-28 00:53:59 +02002109 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002110 if (fd < 0) {
2111 return NULL;
2112 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02002113
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002114 fp = fdopen(fd, "r");
2115 if (fp == NULL) {
2116 return NULL;
2117 }
2118 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2119 if (tok == NULL) {
2120 fclose(fp);
2121 return NULL;
2122 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002123 if (filename != NULL) {
2124 Py_INCREF(filename);
2125 tok->filename = filename;
2126 }
2127 else {
2128 tok->filename = PyUnicode_FromString("<string>");
2129 if (tok->filename == NULL) {
2130 fclose(fp);
2131 PyTokenizer_Free(tok);
2132 return encoding;
2133 }
2134 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002135 while (tok->lineno < 2 && tok->done == E_OK) {
2136 PyTokenizer_Get(tok, &p_start, &p_end);
2137 }
2138 fclose(fp);
2139 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01002140 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Pablo Galindo261a4522021-03-28 23:48:05 +01002141 if (encoding) {
Hansraj Das69f37bc2019-08-15 21:49:07 +05302142 strcpy(encoding, tok->encoding);
Pablo Galindo261a4522021-03-28 23:48:05 +01002143 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002144 }
2145 PyTokenizer_Free(tok);
2146 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002147}
Thomas Wouters89d996e2007-09-08 17:39:28 +00002148
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002149char *
2150PyTokenizer_FindEncoding(int fd)
2151{
2152 return PyTokenizer_FindEncodingFilename(fd, NULL);
2153}
2154
Guido van Rossum408027e1996-12-30 16:17:54 +00002155#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002156
2157void
Thomas Wouters23c9e002000-07-22 19:20:54 +00002158tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002159{
Miss Islington (bot)038f4522021-10-27 14:45:43 -07002160 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002161 if (type == NAME || type == NUMBER || type == STRING || type == OP)
Miss Islington (bot)038f4522021-10-27 14:45:43 -07002162 fprintf(stderr, "(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002163}
2164
2165#endif