blob: 672fdb92ec86f8adec8078f632858da12a214352 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080018/* Alternate tab spacing */
19#define ALTTABSIZE 1
20
/* Nonzero if C may start an identifier: an ASCII letter, '_' or any value
   >= 128.  Every use of the argument is parenthesized so that compound
   expressions (e.g. `x ? a : b`) are classified correctly.  The argument
   is still evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

/* Nonzero if C may appear inside an identifier: everything accepted by
   is_potential_identifier_start() plus the ASCII digits.  The same
   multiple-evaluation caveat applies. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034
Guido van Rossum4fe87291992-02-26 15:24:44 +000035/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000042
Brett Cannond5ec98c2007-10-20 02:54:14 +000043
Guido van Rossumdcfcd142019-01-31 03:40:27 -080044/* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46static const char* type_comment_prefix = "# type: ";
47
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000051tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052{
Victor Stinner00d7abd2020-12-01 09:56:42 +010053 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000054 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060057 tok->buf = tok->cur = tok->inp = NULL;
Pablo Galindocd8dcbc2021-03-14 04:38:40 +010058 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060061 tok->start = NULL;
62 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Guido van Rossum495da292019-03-07 12:38:08 -080084 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -070088 tok->interactive_underflow = IUNDERFLOW_NORMAL;
Pablo Galindo Salgado07cf66f2021-11-21 04:15:22 +000089 tok->str = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000091}
92
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000093static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070094new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095{
Victor Stinner00d7abd2020-12-01 09:56:42 +010096 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 memcpy(result, s, len);
102 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000104}
105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100111 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   The first 12 bytes of S are lower-cased and '_' is replaced by '-'
   before comparing.  Returns the literal "utf-8" or "iso-8859-1" when S
   is one of the recognized spellings (optionally followed by a
   "-suffix"); otherwise returns S itself, unchanged.  The caller can
   therefore detect normalization by comparing the result with S. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
Pablo Galindo261a4522021-03-28 23:48:05 +0100166 if (memcmp(t, "coding", 6) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
Pablo Galindo261a4522021-03-28 23:48:05 +0100173 } while (t[0] == ' ' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100187 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200210 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 /* It's a continuation line, so it can't be a coding spec. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100212 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100215 if (!get_coding_spec(line, &cs, size, tok)) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700216 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100217 }
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100226 tok->decoding_state = STATE_NORMAL;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200227 break;
228 }
229 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200231 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100232 tok->decoding_state = STATE_NORMAL;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (tok->encoding == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100234 assert(tok->decoding_readline == NULL);
235 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236 error_ret(tok);
237 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238 PyMem_Free(cs);
239 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 } else { /* then, compare cs with BOM */
Pablo Galindo261a4522021-03-28 23:48:05 +0100243 if (strcmp(tok->encoding, cs) != 0) {
244 error_ret(tok);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 PyErr_Format(PyExc_SyntaxError,
246 "encoding problem: %s with BOM", cs);
Pablo Galindo261a4522021-03-28 23:48:05 +0100247 PyMem_Free(cs);
248 return 0;
249 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100250 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100252 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253}
254
255/* See whether the file starts with a BOM. If it does,
256 invoke the set_readline function with the new encoding.
257 Return 1 on success, 0 on failure. */
258
259static int
260check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 void unget_char(int, struct tok_state *),
262 int set_readline(struct tok_state *, const char *),
263 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 int ch1, ch2, ch3;
266 ch1 = get_char(tok);
Pablo Galindo261a4522021-03-28 23:48:05 +0100267 tok->decoding_state = STATE_SEEK_CODING;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (ch1 == EOF) {
269 return 1;
270 } else if (ch1 == 0xEF) {
271 ch2 = get_char(tok);
272 if (ch2 != 0xBB) {
273 unget_char(ch2, tok);
274 unget_char(ch1, tok);
275 return 1;
276 }
277 ch3 = get_char(tok);
278 if (ch3 != 0xBF) {
279 unget_char(ch3, tok);
280 unget_char(ch2, tok);
281 unget_char(ch1, tok);
282 return 1;
283 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000284#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 /* Disable support for UTF-16 BOMs until a decision
286 is made whether this needs to be supported. */
287 } else if (ch1 == 0xFE) {
288 ch2 = get_char(tok);
289 if (ch2 != 0xFF) {
290 unget_char(ch2, tok);
291 unget_char(ch1, tok);
292 return 1;
293 }
294 if (!set_readline(tok, "utf-16-be"))
295 return 0;
296 tok->decoding_state = STATE_NORMAL;
297 } else if (ch1 == 0xFF) {
298 ch2 = get_char(tok);
299 if (ch2 != 0xFE) {
300 unget_char(ch2, tok);
301 unget_char(ch1, tok);
302 return 1;
303 }
304 if (!set_readline(tok, "utf-16-le"))
305 return 0;
306 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 } else {
309 unget_char(ch1, tok);
310 return 1;
311 }
312 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100313 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700314 tok->encoding = new_string("utf-8", 5, tok);
315 if (!tok->encoding)
316 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 /* No need to set_readline: input is already utf-8 */
318 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319}
320
Pablo Galindo261a4522021-03-28 23:48:05 +0100321static int
322tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348}
349
350
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have
           enough room (in the s buffer) to copy the entire contents of the
           line read by tok->decoding_readline.  tok->decoding_buffer has the
           overflow.  In this case, tok_readline_recode is called in a loop
           (with an expanded buffer) until the buffer ends with a '\n' (or
           until the end of the file is reached): see tok_nextc and its calls
           to tok_reserve_buf.
*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
Pablo Galindo261a4522021-03-28 23:48:05 +0100366static int
367tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368{
Pablo Galindo261a4522021-03-28 23:48:05 +0100369 Py_ssize_t cur = tok->cur - tok->buf;
370 Py_ssize_t oldsize = tok->inp - tok->buf;
371 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372 if (newsize > tok->end - tok->buf) {
373 char *newbuf = tok->buf;
374 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700375 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
Pablo Galindo261a4522021-03-28 23:48:05 +0100377 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378 if (newbuf == NULL) {
379 tok->done = E_NOMEM;
380 return 0;
381 }
382 tok->buf = newbuf;
383 tok->cur = tok->buf + cur;
384 tok->inp = tok->buf + oldsize;
385 tok->end = tok->buf + newsize;
386 tok->start = start < 0 ? NULL : tok->buf + start;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700387 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
Pablo Galindo261a4522021-03-28 23:48:05 +0100389 }
390 return 1;
391}
392
393static int
394tok_readline_recode(struct tok_state *tok) {
395 PyObject *line;
396 const char *buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 Py_ssize_t buflen;
Pablo Galindo261a4522021-03-28 23:48:05 +0100398 line = tok->decoding_buffer;
399 if (line == NULL) {
400 line = PyObject_CallNoArgs(tok->decoding_readline);
401 if (line == NULL) {
402 error_ret(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 goto error;
404 }
405 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100406 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 tok->decoding_buffer = NULL;
Pablo Galindo261a4522021-03-28 23:48:05 +0100408 }
409 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410 if (buf == NULL) {
411 error_ret(tok);
412 goto error;
413 }
414 if (!tok_reserve_buf(tok, buflen + 1)) {
415 goto error;
416 }
417 memcpy(tok->inp, buf, buflen);
418 tok->inp += buflen;
419 *tok->inp = '\0';
420 if (tok->fp_interactive &&
421 tok_concatenate_interactive_new_line(tok, buf) == -1) {
422 goto error;
423 }
424 Py_DECREF(line);
425 return 1;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426error:
Pablo Galindo261a4522021-03-28 23:48:05 +0100427 Py_XDECREF(line);
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429}
430
431/* Set the readline function for TOK to a StreamReader's
432 readline function. The StreamReader is named ENC.
433
434 This function is called from check_bom and check_coding_spec.
435
436 ENC is usually identical to the future value of tok->encoding,
437 except for the (currently unsupported) case of UTF-16.
438
439 Return 1 on success, 0 on failure. */
440
441static int
442fp_setreadl(struct tok_state *tok, const char* enc)
443{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200445 _Py_IDENTIFIER(open);
446 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000447 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200448 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
Victor Stinner22a351a2010-10-14 12:04:34 +0000450 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200451 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100452 * position of tok->fp. If tok->fp was opened in text mode on Windows,
453 * its file position counts CRLF as one char and can't be directly mapped
454 * to the file offset for fd. Instead we step back one byte and read to
455 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200456 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100457 if (pos == -1 ||
458 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000459 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700460 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000461 }
462
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700463 io = PyImport_ImportModuleNoBlock("io");
464 if (io == NULL)
465 return 0;
466
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200467 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000468 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700469 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700471 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200473 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700474 Py_DECREF(stream);
475 if (readline == NULL)
476 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300477 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700478
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100479 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100480 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700481 if (bufobj == NULL)
482 return 0;
483 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100484 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700486 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487}
488
489/* Fetch the next byte from TOK. */
490
491static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493}
494
495/* Unfetch the last byte back into TOK. */
496
497static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499}
500
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   Only the lead byte range and the 10xxxxxx form of the continuation
   bytes are checked.  Continuation bytes are examined front to back, so
   on a NUL-terminated string a truncated sequence is rejected at the
   terminator and no byte beyond it is read. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xC0) {
        /* following byte */
        return 0;
    }
    if (*s < 0xE0) {
        trailing = 1;
    }
    else if (*s < 0xF0) {
        trailing = 2;
    }
    else if (*s < 0xF8) {
        trailing = 3;
    }
    else {
        return 0;
    }
    for (i = 1; i <= trailing; i++) {
        /* A NUL terminator fails the first test, which ends the scan. */
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return trailing + 1;
}
528
Pablo Galindo261a4522021-03-28 23:48:05 +0100529static int
530ensure_utf8(char *line, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 int badchar = 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000538 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000539 }
540 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
Pablo Galindo261a4522021-03-28 23:48:05 +0100543 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200544 PyErr_Format(PyExc_SyntaxError,
Pablo Galindo261a4522021-03-28 23:48:05 +0100545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
Miss Islington (bot)f7f1c262021-07-30 07:25:28 -0700548 "see https://python.org/dev/peps/pep-0263/ for details",
Pablo Galindo261a4522021-03-28 23:48:05 +0100549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100552 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Fetch a byte from TOK, using the string buffer. */
556
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000557static int
558buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562/* Unfetch a byte from TOK, using the string buffer. */
563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000564static void
565buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 tok->str--;
567 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568}
569
570/* Set the readline function for TOK to ENC. For the string-based
571 tokenizer, this means to just record the encoding. */
572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000573static int
574buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 tok->enc = enc;
576 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577}
578
579/* Return a UTF-8 encoding Python string object from the
580 C byte string STR, which is encoded with ENC. */
581
582static PyObject *
583translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000593
594static char *
595translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 char *buf, *current;
599 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100600 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000629 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100631 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000632 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100633 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000634 }
635 buf = result;
636 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000638}
639
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640/* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
Andy Lester384f3c52020-02-27 20:44:52 -0600644static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000645decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600648 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 if (tok->enc != NULL) {
662 utf8 = translate_into_utf8(str, tok->enc);
663 if (utf8 == NULL)
664 return error_ret(tok);
665 str = PyBytes_AsString(utf8);
666 }
667 for (s = str;; s++) {
668 if (*s == '\0') break;
669 else if (*s == '\n') {
670 assert(lineno < 2);
671 newl[lineno] = s;
672 lineno++;
673 if (lineno == 2) break;
674 }
675 }
676 tok->enc = NULL;
677 /* need to check line 1 and 2 separately since check_coding_spec
678 assumes a single line as input */
679 if (newl[0]) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100680 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681 return NULL;
682 }
683 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685 tok, buf_setreadl))
Pablo Galindo261a4522021-03-28 23:48:05 +0100686 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 }
688 }
689 if (tok->enc != NULL) {
690 assert(utf8 == NULL);
691 utf8 = translate_into_utf8(str, tok->enc);
692 if (utf8 == NULL)
693 return error_ret(tok);
694 str = PyBytes_AS_STRING(utf8);
695 }
696 assert(tok->decoding_buffer == NULL);
697 tok->decoding_buffer = utf8; /* CAUTION */
698 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699}
700
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701/* Set up tokenizer for string */
702
703struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600707 char *decoded;
708
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (tok == NULL)
710 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyTokenizer_Free(tok);
714 return NULL;
715 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716
Andy Lester384f3c52020-02-27 20:44:52 -0600717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720}
721
Pablo Galindo261a4522021-03-28 23:48:05 +0100722/* Set up tokenizer for UTF-8 string */
723
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000724struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000725PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600728 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 if (tok == NULL)
730 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyTokenizer_Free(tok);
734 return NULL;
735 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100736 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600738 tok->str = translated;
Pablo Galindo261a4522021-03-28 23:48:05 +0100739 tok->encoding = new_string("utf-8", 5, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744
Andy Lester384f3c52020-02-27 20:44:52 -0600745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748}
749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300753PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100771 tok->encoding = new_string(enc, strlen(enc), tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779}
780
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781/* Free a tok_state structure */
782
783void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000784PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785{
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100786 if (tok->encoding != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100787 PyMem_Free(tok->encoding);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100788 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200791 Py_XDECREF(tok->filename);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100792 if (tok->fp != NULL && tok->buf != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100793 PyMem_Free(tok->buf);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100794 }
795 if (tok->input) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100796 PyMem_Free(tok->input);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100801 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
Pablo Galindo261a4522021-03-28 23:48:05 +0100804static int
805tok_readline_raw(struct tok_state *tok)
806{
807 do {
808 if (!tok_reserve_buf(tok, BUFSIZ)) {
809 return 0;
810 }
811 char *line = Py_UniversalNewlineFgets(tok->inp,
812 (int)(tok->end - tok->inp),
813 tok->fp, NULL);
814 if (line == NULL) {
815 return 1;
816 }
817 if (tok->fp_interactive &&
818 tok_concatenate_interactive_new_line(tok, line) == -1) {
819 return 0;
820 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100821 if (*tok->inp == '\0') {
822 return 0;
823 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100824 tok->inp = strchr(tok->inp, '\0');
825 } while (tok->inp[-1] != '\n');
826 return 1;
827}
828
829static int
830tok_underflow_string(struct tok_state *tok) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL) {
833 end++;
834 }
835 else {
836 end = strchr(tok->inp, '\0');
837 if (end == tok->inp) {
838 tok->done = E_EOF;
839 return 0;
840 }
841 }
842 if (tok->start == NULL) {
843 tok->buf = tok->cur;
844 }
845 tok->line_start = tok->cur;
846 tok->lineno++;
847 tok->inp = end;
848 return 1;
849}
850
851static int
852tok_underflow_interactive(struct tok_state *tok) {
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -0700853 if (tok->interactive_underflow == IUNDERFLOW_STOP) {
854 tok->done = E_INTERACT_STOP;
855 return 1;
856 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100857 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
858 if (newtok != NULL) {
859 char *translated = translate_newlines(newtok, 0, tok);
860 PyMem_Free(newtok);
861 if (translated == NULL) {
862 return 0;
863 }
864 newtok = translated;
865 }
866 if (tok->encoding && newtok && *newtok) {
867 /* Recode to UTF-8 */
868 Py_ssize_t buflen;
869 const char* buf;
870 PyObject *u = translate_into_utf8(newtok, tok->encoding);
871 PyMem_Free(newtok);
872 if (u == NULL) {
873 tok->done = E_DECODE;
874 return 0;
875 }
876 buflen = PyBytes_GET_SIZE(u);
877 buf = PyBytes_AS_STRING(u);
878 newtok = PyMem_Malloc(buflen+1);
879 if (newtok == NULL) {
880 Py_DECREF(u);
881 tok->done = E_NOMEM;
882 return 0;
883 }
884 strcpy(newtok, buf);
885 Py_DECREF(u);
886 }
887 if (tok->fp_interactive &&
888 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
889 PyMem_Free(newtok);
890 return 0;
891 }
892 if (tok->nextprompt != NULL) {
893 tok->prompt = tok->nextprompt;
894 }
895 if (newtok == NULL) {
896 tok->done = E_INTR;
897 }
898 else if (*newtok == '\0') {
899 PyMem_Free(newtok);
900 tok->done = E_EOF;
901 }
902 else if (tok->start != NULL) {
903 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
904 size_t size = strlen(newtok);
905 tok->lineno++;
906 if (!tok_reserve_buf(tok, size + 1)) {
907 PyMem_Free(tok->buf);
908 tok->buf = NULL;
909 PyMem_Free(newtok);
910 return 0;
911 }
912 memcpy(tok->cur, newtok, size + 1);
913 PyMem_Free(newtok);
914 tok->inp += size;
915 tok->multi_line_start = tok->buf + cur_multi_line_start;
916 }
917 else {
918 tok->lineno++;
919 PyMem_Free(tok->buf);
920 tok->buf = newtok;
921 tok->cur = tok->buf;
922 tok->line_start = tok->buf;
923 tok->inp = strchr(tok->buf, '\0');
924 tok->end = tok->inp + 1;
925 }
926 if (tok->done != E_OK) {
927 if (tok->prompt != NULL) {
928 PySys_WriteStderr("\n");
929 }
930 return 0;
931 }
932 return 1;
933}
934
935static int
936tok_underflow_file(struct tok_state *tok) {
937 if (tok->start == NULL) {
938 tok->cur = tok->inp = tok->buf;
939 }
940 if (tok->decoding_state == STATE_INIT) {
941 /* We have not yet determined the encoding.
942 If an encoding is found, use the file-pointer
943 reader functions from now on. */
944 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945 error_ret(tok);
946 return 0;
947 }
948 assert(tok->decoding_state != STATE_INIT);
949 }
950 /* Read until '\n' or EOF */
951 if (tok->decoding_readline != NULL) {
952 /* We already have a codec associated with this input. */
953 if (!tok_readline_recode(tok)) {
954 return 0;
955 }
956 }
957 else {
958 /* We want a 'raw' read. */
959 if (!tok_readline_raw(tok)) {
960 return 0;
961 }
962 }
963 if (tok->inp == tok->cur) {
964 tok->done = E_EOF;
965 return 0;
966 }
967 if (tok->inp[-1] != '\n') {
968 /* Last line does not end in \n, fake one */
969 *tok->inp++ = '\n';
970 *tok->inp = '\0';
971 }
972
973 tok->lineno++;
974 if (tok->decoding_state != STATE_NORMAL) {
975 if (tok->lineno > 2) {
976 tok->decoding_state = STATE_NORMAL;
977 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100978 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Pablo Galindo261a4522021-03-28 23:48:05 +0100979 tok, fp_setreadl))
980 {
981 return 0;
982 }
983 }
984 /* The default encoding is UTF-8, so make sure we don't have any
985 non-UTF-8 sequences in it. */
986 if (!tok->encoding
987 && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
988 if (!ensure_utf8(tok->cur, tok)) {
989 error_ret(tok);
990 return 0;
991 }
992 }
993 assert(tok->done == E_OK);
994 return tok->done == E_OK;
995}
996
#if defined(Py_DEBUG)
/* Debug helper: write the `size` bytes starting at `s` to `f` as a
   double-quoted string, or the word NULL when `s` is NULL.

   Newline, carriage return, tab, form feed, both quote characters and the
   backslash are written as two-character escapes; every other byte outside
   the printable ASCII range 0x20..0x7e is written as \xNN.  Escaping the
   backslash keeps the output unambiguous (a literal backslash followed by
   'n' cannot be mistaken for a newline), and DEL (0x7f) is hex-escaped
   because it is a control character, not a printable one. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001025
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001026/* Get next char, updating state; error code goes into tok->done */
1027
1028static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001029tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001030{
Pablo Galindo261a4522021-03-28 23:48:05 +01001031 int rc;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 for (;;) {
1033 if (tok->cur != tok->inp) {
1034 return Py_CHARMASK(*tok->cur++); /* Fast path */
1035 }
1036 if (tok->done != E_OK)
1037 return EOF;
1038 if (tok->fp == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +01001039 rc = tok_underflow_string(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001041 else if (tok->prompt != NULL) {
1042 rc = tok_underflow_interactive(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 }
1044 else {
Pablo Galindo261a4522021-03-28 23:48:05 +01001045 rc = tok_underflow_file(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001047#if defined(Py_DEBUG)
Pablo Galindo261a4522021-03-28 23:48:05 +01001048 if (Py_DebugFlag) {
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001049 fprintf(stderr, "line[%d] = ", tok->lineno);
Miss Islington (bot)d8ca47c2021-10-29 10:21:15 -07001050 print_escape(stderr, tok->cur, tok->inp - tok->cur);
Miss Islington (bot)038f4522021-10-27 14:45:43 -07001051 fprintf(stderr, " tok->done = %d\n", tok->done);
Pablo Galindo261a4522021-03-28 23:48:05 +01001052 }
Miss Islington (bot)ae78ffd2021-10-22 03:14:47 -07001053#endif
Pablo Galindo261a4522021-03-28 23:48:05 +01001054 if (!rc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 tok->cur = tok->inp;
1056 return EOF;
1057 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001058 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001060 Py_UNREACHABLE();
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061}
1062
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001063/* Back-up one character */
1064
1065static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001066tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001067{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001068 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001069 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001070 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001071 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001072 if ((int)(unsigned char)*tok->cur != c) {
1073 Py_FatalError("tok_backup: wrong character");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001074 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001075 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001076}
1077
Guido van Rossum926f13a1998-04-09 21:38:06 +00001078static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001079_syntaxerror_range(struct tok_state *tok, const char *format,
1080 int col_offset, int end_col_offset,
1081 va_list vargs)
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001082{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001083 PyObject *errmsg, *errtext, *args;
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001084 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001085 if (!errmsg) {
1086 goto error;
1087 }
1088
1089 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1090 "replace");
1091 if (!errtext) {
1092 goto error;
1093 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001094
1095 if (col_offset == -1) {
1096 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1097 }
1098 if (end_col_offset == -1) {
1099 end_col_offset = col_offset;
1100 }
1101
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001102 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1103 if (line_len != tok->cur - tok->line_start) {
1104 Py_DECREF(errtext);
1105 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1106 "replace");
1107 }
1108 if (!errtext) {
1109 goto error;
1110 }
1111
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001112 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1113 col_offset, errtext, tok->lineno, end_col_offset);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001114 if (args) {
1115 PyErr_SetObject(PyExc_SyntaxError, args);
1116 Py_DECREF(args);
1117 }
1118
1119error:
1120 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001121 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001122 return ERRORTOKEN;
1123}
1124
1125static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001126syntaxerror(struct tok_state *tok, const char *format, ...)
1127{
1128 va_list vargs;
1129#ifdef HAVE_STDARG_PROTOTYPES
1130 va_start(vargs, format);
1131#else
1132 va_start(vargs);
1133#endif
1134 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1135 va_end(vargs);
1136 return ret;
1137}
1138
1139static int
1140syntaxerror_known_range(struct tok_state *tok,
1141 int col_offset, int end_col_offset,
1142 const char *format, ...)
1143{
1144 va_list vargs;
1145#ifdef HAVE_STDARG_PROTOTYPES
1146 va_start(vargs, format);
1147#else
1148 va_start(vargs);
1149#endif
1150 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1151 va_end(vargs);
1152 return ret;
1153}
1154
1155
1156
1157static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001158indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001160 tok->done = E_TABSPACE;
1161 tok->cur = tok->inp;
1162 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001163}
1164
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001165static int
1166parser_warn(struct tok_state *tok, const char *format, ...)
1167{
1168 PyObject *errmsg;
1169 va_list vargs;
1170#ifdef HAVE_STDARG_PROTOTYPES
1171 va_start(vargs, format);
1172#else
1173 va_start(vargs);
1174#endif
1175 errmsg = PyUnicode_FromFormatV(format, vargs);
1176 va_end(vargs);
1177 if (!errmsg) {
1178 goto error;
1179 }
1180
1181 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1182 tok->lineno, NULL, NULL) < 0) {
1183 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1184 /* Replace the DeprecationWarning exception with a SyntaxError
1185 to get a more accurate error report */
1186 PyErr_Clear();
1187 syntaxerror(tok, "%U", errmsg);
1188 }
1189 goto error;
1190 }
1191 Py_DECREF(errmsg);
1192 return 0;
1193
1194error:
1195 Py_XDECREF(errmsg);
1196 tok->done = E_ERROR;
1197 return -1;
1198}
1199
/* Check, without consuming anything, whether the upcoming input is exactly
   `test` followed by a character that cannot continue an identifier.

   Every character read is pushed back before returning, so the tokenizer
   position is unchanged.  Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with `test`. */
    while (*p != 0 && c == *p) {
        p++;
        c = tok_nextc(tok);
    }

    /* A match needs all of `test` to be seen and the character after it
       (now in c) not to be an identifier character. */
    int matched = 0;
    if (*p == 0) {
        matched = !is_potential_identifier_char(c);
    }

    /* Undo: first the character that ended the scan, then the matched
       prefix in reverse order. */
    tok_backup(tok, c);
    while (p != test) {
        p--;
        tok_backup(tok, *p);
    }
    return matched;
}
1222
/* Check the character `c` that directly follows a numeric literal of the
   given `kind` (the word is inserted into "invalid %s literal").

   A deprecation warning is emitted only if the literal is immediately
   followed by one of the keywords that can occur after a numeric literal in
   valid code: "and", "else", "for", "if", "in", "is" and "or".  This allows
   existing valid code to be deprecated gradually, without adding a warning
   in front of the error in most cases of invalid numeric literals (which
   would be confusing and break existing tests).  If the literal is
   immediately followed by any other keyword or identifier, a syntax error
   with a slightly better message than plain "invalid syntax" is raised.

   Note that for 'i' only the following letter ('f', 'n' or 's') is
   examined; the other keywords go through lookahead().

   Returns 1 if tokenizing may continue, 0 if an error has been set. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    int keyword_follows = 0;

    switch (c) {
    case 'a':
        keyword_follows = lookahead(tok, "nd");
        break;
    case 'e':
        keyword_follows = lookahead(tok, "lse");
        break;
    case 'f':
        keyword_follows = lookahead(tok, "or");
        break;
    case 'i': {
        int next = tok_nextc(tok);
        if (next == 'f' || next == 'n' || next == 's') {
            keyword_follows = 1;
        }
        tok_backup(tok, next);
        break;
    }
    case 'o':
        keyword_follows = lookahead(tok, "r");
        break;
    default:
        break;
    }

    if (keyword_follows) {
        /* Put c back while the warning is issued, then read it again. */
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
        return 1;
    }

    /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272/* Verify that the identifier follows PEP 3131.
1273 All identifier strings are guaranteed to be "ready" unicode objects.
1274 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001275static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001276verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001277{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001279 if (tok->decoding_erred)
1280 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001281 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001282 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001284 tok->done = E_DECODE;
1285 }
1286 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 tok->done = E_ERROR;
1288 }
1289 return 0;
1290 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001291 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1292 if (invalid < 0) {
1293 Py_DECREF(s);
1294 tok->done = E_ERROR;
1295 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001296 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001297 assert(PyUnicode_GET_LENGTH(s) > 0);
1298 if (invalid < PyUnicode_GET_LENGTH(s)) {
1299 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1300 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1301 /* Determine the offset in UTF-8 encoded input */
1302 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1303 if (s != NULL) {
1304 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1305 }
1306 if (s == NULL) {
1307 tok->done = E_ERROR;
1308 return 0;
1309 }
1310 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1311 }
1312 Py_DECREF(s);
1313 // PyUnicode_FromFormatV() does not support %X
1314 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001315 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001316 if (Py_UNICODE_ISPRINTABLE(ch)) {
1317 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1318 }
1319 else {
1320 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1321 }
1322 return 0;
1323 }
1324 Py_DECREF(s);
1325 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001326}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001327
/* Consume the rest of a run of decimal digits in which single underscores
   may separate digit groups.

   Returns the first character after the run (it has been read and is not
   pushed back).  If an underscore is not followed by a digit, the offending
   character is pushed back, a syntax error is raised and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1349
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350/* Get next token, after space stripping etc. */
1351
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001352static int
Andy Lester384f3c52020-02-27 20:44:52 -06001353tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001355 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001357
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001358 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001359 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 tok->start = NULL;
1361 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001362
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001363 /* Get indentation level */
1364 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001365 int col = 0;
1366 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 tok->atbol = 0;
1368 for (;;) {
1369 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001370 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001372 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001374 col = (col / tok->tabsize + 1) * tok->tabsize;
1375 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 }
Brett Cannona721aba2016-09-09 14:57:09 -07001377 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001379 }
1380 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001382 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 }
1384 tok_backup(tok, c);
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001385 if (c == '#' || c == '\n' || c == '\\') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386 /* Lines with only whitespace and/or comments
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001387 and/or a line continuation character
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 shouldn't affect the indentation and are
1389 not passed to the parser as NEWLINE tokens,
1390 except *totally* empty lines in interactive
1391 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001392 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001394 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001395 else if (tok->prompt != NULL && tok->lineno == 1) {
1396 /* In interactive mode, if the first line contains
1397 only spaces and/or a comment, let it through. */
1398 blankline = 0;
1399 col = altcol = 0;
1400 }
Brett Cannona721aba2016-09-09 14:57:09 -07001401 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001403 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 /* We can't jump back right here since we still
1405 may need to skip to the end of a comment */
1406 }
1407 if (!blankline && tok->level == 0) {
1408 if (col == tok->indstack[tok->indent]) {
1409 /* No change */
1410 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001411 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 }
1413 }
1414 else if (col > tok->indstack[tok->indent]) {
1415 /* Indent -- always one */
1416 if (tok->indent+1 >= MAXINDENT) {
1417 tok->done = E_TOODEEP;
1418 tok->cur = tok->inp;
1419 return ERRORTOKEN;
1420 }
1421 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001422 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 }
1424 tok->pendin++;
1425 tok->indstack[++tok->indent] = col;
1426 tok->altindstack[tok->indent] = altcol;
1427 }
1428 else /* col < tok->indstack[tok->indent] */ {
1429 /* Dedent -- any number, must be consistent */
1430 while (tok->indent > 0 &&
1431 col < tok->indstack[tok->indent]) {
1432 tok->pendin--;
1433 tok->indent--;
1434 }
1435 if (col != tok->indstack[tok->indent]) {
1436 tok->done = E_DEDENT;
1437 tok->cur = tok->inp;
1438 return ERRORTOKEN;
1439 }
1440 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001441 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 }
1443 }
1444 }
1445 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001446
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 /* Return pending indents/dedents */
1450 if (tok->pendin != 0) {
1451 if (tok->pendin < 0) {
1452 tok->pendin++;
1453 return DEDENT;
1454 }
1455 else {
1456 tok->pendin--;
1457 return INDENT;
1458 }
1459 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460
Guido van Rossum495da292019-03-07 12:38:08 -08001461 /* Peek ahead at the next character */
1462 c = tok_nextc(tok);
1463 tok_backup(tok, c);
1464 /* Check if we are closing an async function */
1465 if (tok->async_def
1466 && !blankline
1467 /* Due to some implementation artifacts of type comments,
1468 * a TYPE_COMMENT at the start of a function won't set an
1469 * indentation level and it will produce a NEWLINE after it.
1470 * To avoid spuriously ending an async function due to this,
1471 * wait until we have some non-newline char in front of us. */
1472 && c != '\n'
1473 && tok->level == 0
1474 /* There was a NEWLINE after ASYNC DEF,
1475 so we're past the signature. */
1476 && tok->async_def_nl
1477 /* Current indentation level is less than where
1478 the async function was defined */
1479 && tok->async_def_indent >= tok->indent)
1480 {
1481 tok->async_def = 0;
1482 tok->async_def_indent = 0;
1483 tok->async_def_nl = 0;
1484 }
1485
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001486 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001487 tok->start = NULL;
1488 /* Skip spaces */
1489 do {
1490 c = tok_nextc(tok);
1491 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Set start of current token */
1494 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001495
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001496 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001497 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001498 const char *prefix, *p, *type_start;
1499
Brett Cannona721aba2016-09-09 14:57:09 -07001500 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001502 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001503
1504 if (tok->type_comments) {
1505 p = tok->start;
1506 prefix = type_comment_prefix;
1507 while (*prefix && p < tok->cur) {
1508 if (*prefix == ' ') {
1509 while (*p == ' ' || *p == '\t') {
1510 p++;
1511 }
1512 } else if (*prefix == *p) {
1513 p++;
1514 } else {
1515 break;
1516 }
1517
1518 prefix++;
1519 }
1520
1521 /* This is a type comment if we matched all of type_comment_prefix. */
1522 if (!*prefix) {
1523 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001524 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001525 tok_backup(tok, c); /* don't eat the newline or EOF */
1526
1527 type_start = p;
1528
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001529 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001530 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001531 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001532 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001533 && !(tok->cur > ignore_end
1534 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001535
1536 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001537 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001538 *p_end = tok->cur;
1539
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001540 /* If this type ignore is the only thing on the line, consume the newline also. */
1541 if (blankline) {
1542 tok_nextc(tok);
1543 tok->atbol = 1;
1544 }
1545 return TYPE_IGNORE;
1546 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001547 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001548 *p_end = tok->cur;
1549 return TYPE_COMMENT;
1550 }
1551 }
1552 }
Brett Cannona721aba2016-09-09 14:57:09 -07001553 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001554
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -07001555 if (tok->done == E_INTERACT_STOP) {
1556 return ENDMARKER;
1557 }
1558
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 /* Check for EOF and errors now */
1560 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001561 if (tok->level) {
1562 return ERRORTOKEN;
1563 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001564 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1565 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001566
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 /* Identifier (most frequent token!) */
1568 nonascii = 0;
1569 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001570 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001571 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001572 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001573 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001574 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001575 /* Since this is a backwards compatibility support literal we don't
1576 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001577 else if (!(saw_b || saw_u || saw_r || saw_f)
1578 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001579 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001580 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001581 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001582 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001583 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001584 }
1585 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001586 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001587 }
1588 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001589 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001590 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001592 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001594 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 }
1596 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001597 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001599 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 c = tok_nextc(tok);
1601 }
1602 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001603 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001605 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001606
1607 *p_start = tok->start;
1608 *p_end = tok->cur;
1609
Guido van Rossum495da292019-03-07 12:38:08 -08001610 /* async/await parsing block. */
1611 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1612 /* May be an 'async' or 'await' token. For Python 3.7 or
1613 later we recognize them unconditionally. For Python
1614 3.5 or 3.6 we recognize 'async' in front of 'def', and
1615 either one inside of 'async def'. (Technically we
1616 shouldn't recognize these at all for 3.4 or earlier,
1617 but there's no *valid* Python 3.4 code that would be
1618 rejected, and async functions will be rejected in a
1619 later phase.) */
1620 if (!tok->async_hacks || tok->async_def) {
1621 /* Always recognize the keywords. */
1622 if (memcmp(tok->start, "async", 5) == 0) {
1623 return ASYNC;
1624 }
1625 if (memcmp(tok->start, "await", 5) == 0) {
1626 return AWAIT;
1627 }
1628 }
1629 else if (memcmp(tok->start, "async", 5) == 0) {
1630 /* The current token is 'async'.
1631 Look ahead one token to see if that is 'def'. */
1632
1633 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001634 const char *ahead_tok_start = NULL;
1635 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001636 int ahead_tok_kind;
1637
1638 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1639 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1640 &ahead_tok_end);
1641
1642 if (ahead_tok_kind == NAME
1643 && ahead_tok.cur - ahead_tok.start == 3
1644 && memcmp(ahead_tok.start, "def", 3) == 0)
1645 {
1646 /* The next token is going to be 'def', so instead of
1647 returning a plain NAME token, return ASYNC. */
1648 tok->async_def_indent = tok->indent;
1649 tok->async_def = 1;
1650 return ASYNC;
1651 }
1652 }
1653 }
1654
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 return NAME;
1656 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001657
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 /* Newline */
1659 if (c == '\n') {
1660 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001661 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001663 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 *p_start = tok->start;
1665 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1666 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001667 if (tok->async_def) {
1668 /* We're somewhere inside an 'async def' function, and
1669 we've encountered a NEWLINE after its signature. */
1670 tok->async_def_nl = 1;
1671 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001672 return NEWLINE;
1673 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001674
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 /* Period or number starting with period? */
1676 if (c == '.') {
1677 c = tok_nextc(tok);
1678 if (isdigit(c)) {
1679 goto fraction;
1680 } else if (c == '.') {
1681 c = tok_nextc(tok);
1682 if (c == '.') {
1683 *p_start = tok->start;
1684 *p_end = tok->cur;
1685 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001686 }
1687 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 tok_backup(tok, c);
1689 }
1690 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001691 }
1692 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 tok_backup(tok, c);
1694 }
1695 *p_start = tok->start;
1696 *p_end = tok->cur;
1697 return DOT;
1698 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001699
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 /* Number */
1701 if (isdigit(c)) {
1702 if (c == '0') {
1703 /* Hex, octal or binary -- maybe. */
1704 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 /* Hex */
1707 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001709 if (c == '_') {
1710 c = tok_nextc(tok);
1711 }
1712 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001713 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001714 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001715 }
1716 do {
1717 c = tok_nextc(tok);
1718 } while (isxdigit(c));
1719 } while (c == '_');
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001720 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1721 return ERRORTOKEN;
1722 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001723 }
1724 else if (c == 'o' || c == 'O') {
1725 /* Octal */
1726 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001728 if (c == '_') {
1729 c = tok_nextc(tok);
1730 }
1731 if (c < '0' || c >= '8') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001732 if (isdigit(c)) {
1733 return syntaxerror(tok,
1734 "invalid digit '%c' in octal literal", c);
1735 }
1736 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001737 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001738 return syntaxerror(tok, "invalid octal literal");
1739 }
Brett Cannona721aba2016-09-09 14:57:09 -07001740 }
1741 do {
1742 c = tok_nextc(tok);
1743 } while ('0' <= c && c < '8');
1744 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001745 if (isdigit(c)) {
1746 return syntaxerror(tok,
1747 "invalid digit '%c' in octal literal", c);
1748 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001749 if (!verify_end_of_number(tok, c, "octal")) {
1750 return ERRORTOKEN;
1751 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 }
1753 else if (c == 'b' || c == 'B') {
1754 /* Binary */
1755 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001757 if (c == '_') {
1758 c = tok_nextc(tok);
1759 }
1760 if (c != '0' && c != '1') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001761 if (isdigit(c)) {
1762 return syntaxerror(tok,
1763 "invalid digit '%c' in binary literal", c);
1764 }
1765 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001766 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001767 return syntaxerror(tok, "invalid binary literal");
1768 }
Brett Cannona721aba2016-09-09 14:57:09 -07001769 }
1770 do {
1771 c = tok_nextc(tok);
1772 } while (c == '0' || c == '1');
1773 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001774 if (isdigit(c)) {
1775 return syntaxerror(tok,
1776 "invalid digit '%c' in binary literal", c);
1777 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001778 if (!verify_end_of_number(tok, c, "binary")) {
1779 return ERRORTOKEN;
1780 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001781 }
1782 else {
1783 int nonzero = 0;
1784 /* maybe old-style octal; c is first char of it */
1785 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001786 while (1) {
1787 if (c == '_') {
1788 c = tok_nextc(tok);
1789 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001790 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001791 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001792 }
1793 }
1794 if (c != '0') {
1795 break;
1796 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001797 c = tok_nextc(tok);
1798 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001799 char* zeros_end = tok->cur;
Brett Cannona721aba2016-09-09 14:57:09 -07001800 if (isdigit(c)) {
1801 nonzero = 1;
1802 c = tok_decimal_tail(tok);
1803 if (c == 0) {
1804 return ERRORTOKEN;
1805 }
1806 }
1807 if (c == '.') {
1808 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001809 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001810 }
1811 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001812 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001813 }
1814 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001815 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001816 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001817 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001818 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001819 tok_backup(tok, c);
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001820 return syntaxerror_known_range(
1821 tok, (int)(tok->start + 1 - tok->line_start),
1822 (int)(zeros_end - tok->line_start),
1823 "leading zeros in decimal integer "
1824 "literals are not permitted; "
1825 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001826 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001827 if (!verify_end_of_number(tok, c, "decimal")) {
1828 return ERRORTOKEN;
1829 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001830 }
1831 }
1832 else {
1833 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001834 c = tok_decimal_tail(tok);
1835 if (c == 0) {
1836 return ERRORTOKEN;
1837 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001838 {
1839 /* Accept floating point numbers. */
1840 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001841 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001842 fraction:
1843 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001844 if (isdigit(c)) {
1845 c = tok_decimal_tail(tok);
1846 if (c == 0) {
1847 return ERRORTOKEN;
1848 }
1849 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 }
1851 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001852 int e;
1853 exponent:
1854 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001855 /* Exponent part */
1856 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001857 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001858 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001859 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001860 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001861 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001862 }
1863 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001864 tok_backup(tok, c);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001865 if (!verify_end_of_number(tok, e, "decimal")) {
1866 return ERRORTOKEN;
1867 }
Benjamin Petersonc4161622014-06-07 12:36:39 -07001868 tok_backup(tok, e);
1869 *p_start = tok->start;
1870 *p_end = tok->cur;
1871 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001872 }
Brett Cannona721aba2016-09-09 14:57:09 -07001873 c = tok_decimal_tail(tok);
1874 if (c == 0) {
1875 return ERRORTOKEN;
1876 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 }
Brett Cannona721aba2016-09-09 14:57:09 -07001878 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001879 /* Imaginary part */
1880 imaginary:
1881 c = tok_nextc(tok);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001882 if (!verify_end_of_number(tok, c, "imaginary")) {
1883 return ERRORTOKEN;
1884 }
1885 }
1886 else if (!verify_end_of_number(tok, c, "decimal")) {
1887 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001888 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001889 }
1890 }
1891 tok_backup(tok, c);
1892 *p_start = tok->start;
1893 *p_end = tok->cur;
1894 return NUMBER;
1895 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001896
1897 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001898 /* String */
1899 if (c == '\'' || c == '"') {
1900 int quote = c;
1901 int quote_size = 1; /* 1 or 3 */
1902 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001903
Anthony Sottile995d9b92019-01-12 20:05:13 -08001904 /* Nodes of type STRING, especially multi line strings
1905 must be handled differently in order to get both
1906 the starting line number and the column offset right.
1907 (cf. issue 16806) */
1908 tok->first_lineno = tok->lineno;
1909 tok->multi_line_start = tok->line_start;
1910
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001911 /* Find the quote size and start of string */
1912 c = tok_nextc(tok);
1913 if (c == quote) {
1914 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001915 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001916 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001917 }
1918 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001919 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001920 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001921 }
Brett Cannona721aba2016-09-09 14:57:09 -07001922 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001923 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001924 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001925
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001926 /* Get rest of string */
1927 while (end_quote_size != quote_size) {
1928 c = tok_nextc(tok);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001929 if (c == EOF || (quote_size == 1 && c == '\n')) {
Miss Islington (bot)d03f3422021-06-12 13:27:02 -07001930 assert(tok->multi_line_start != NULL);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001931 // shift the tok_state's location into
1932 // the start of string, and report the error
1933 // from the initial quote character
1934 tok->cur = (char *)tok->start;
1935 tok->cur++;
1936 tok->line_start = tok->multi_line_start;
1937 int start = tok->lineno;
1938 tok->lineno = tok->first_lineno;
1939
Brett Cannona721aba2016-09-09 14:57:09 -07001940 if (quote_size == 3) {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001941 return syntaxerror(tok,
1942 "unterminated triple-quoted string literal"
1943 " (detected at line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001944 }
1945 else {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001946 return syntaxerror(tok,
1947 "unterminated string literal (detected at"
1948 " line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001949 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001950 }
Brett Cannona721aba2016-09-09 14:57:09 -07001951 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001952 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001953 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001954 else {
1955 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001956 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001957 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001958 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001959 }
1960 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001961
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001962 *p_start = tok->start;
1963 *p_end = tok->cur;
1964 return STRING;
1965 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001966
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001967 /* Line continuation */
1968 if (c == '\\') {
1969 c = tok_nextc(tok);
1970 if (c != '\n') {
1971 tok->done = E_LINECONT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001972 return ERRORTOKEN;
1973 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001974 c = tok_nextc(tok);
1975 if (c == EOF) {
1976 tok->done = E_EOF;
1977 tok->cur = tok->inp;
1978 return ERRORTOKEN;
1979 } else {
1980 tok_backup(tok, c);
1981 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001982 tok->cont_line = 1;
1983 goto again; /* Read next line */
1984 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001985
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001986 /* Check for two-character token */
1987 {
1988 int c2 = tok_nextc(tok);
1989 int token = PyToken_TwoChars(c, c2);
1990 if (token != OP) {
1991 int c3 = tok_nextc(tok);
1992 int token3 = PyToken_ThreeChars(c, c2, c3);
1993 if (token3 != OP) {
1994 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001995 }
1996 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001997 tok_backup(tok, c3);
1998 }
1999 *p_start = tok->start;
2000 *p_end = tok->cur;
2001 return token;
2002 }
2003 tok_backup(tok, c2);
2004 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002005
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002006 /* Keep track of parentheses nesting level */
2007 switch (c) {
2008 case '(':
2009 case '[':
2010 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002011 if (tok->level >= MAXLEVEL) {
2012 return syntaxerror(tok, "too many nested parentheses");
2013 }
2014 tok->parenstack[tok->level] = c;
2015 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindoae7d3cd92021-01-20 12:53:52 +00002016 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002017 tok->level++;
2018 break;
2019 case ')':
2020 case ']':
2021 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002022 if (!tok->level) {
2023 return syntaxerror(tok, "unmatched '%c'", c);
2024 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002025 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002026 int opening = tok->parenstack[tok->level];
2027 if (!((opening == '(' && c == ')') ||
2028 (opening == '[' && c == ']') ||
2029 (opening == '{' && c == '}')))
2030 {
2031 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2032 return syntaxerror(tok,
2033 "closing parenthesis '%c' does not match "
2034 "opening parenthesis '%c' on line %d",
2035 c, opening, tok->parenlinenostack[tok->level]);
2036 }
2037 else {
2038 return syntaxerror(tok,
2039 "closing parenthesis '%c' does not match "
2040 "opening parenthesis '%c'",
2041 c, opening);
2042 }
2043 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002044 break;
2045 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002046
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002047 /* Punctuation character */
2048 *p_start = tok->start;
2049 *p_end = tok->cur;
2050 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002051}
2052
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002053int
Andy Lester384f3c52020-02-27 20:44:52 -06002054PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002055{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002056 int result = tok_get(tok, p_start, p_end);
2057 if (tok->decoding_erred) {
2058 result = ERRORTOKEN;
2059 tok->done = E_DECODE;
2060 }
2061 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002062}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002063
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002064/* Get the encoding of a Python file. Check for the coding cookie and check if
2065 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002066
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002067 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2068 encoding in the first or second line of the file (in which case the encoding
2069 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00002070
Victor Stinner00d7abd2020-12-01 09:56:42 +01002071 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002072 by the caller. */
2073
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002074char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002075PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00002076{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002077 struct tok_state *tok;
2078 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06002079 const char *p_start = NULL;
2080 const char *p_end = NULL;
2081 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002082
Victor Stinnerdaf45552013-08-28 00:53:59 +02002083 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002084 if (fd < 0) {
2085 return NULL;
2086 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02002087
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002088 fp = fdopen(fd, "r");
2089 if (fp == NULL) {
2090 return NULL;
2091 }
2092 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2093 if (tok == NULL) {
2094 fclose(fp);
2095 return NULL;
2096 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002097 if (filename != NULL) {
2098 Py_INCREF(filename);
2099 tok->filename = filename;
2100 }
2101 else {
2102 tok->filename = PyUnicode_FromString("<string>");
2103 if (tok->filename == NULL) {
2104 fclose(fp);
2105 PyTokenizer_Free(tok);
2106 return encoding;
2107 }
2108 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002109 while (tok->lineno < 2 && tok->done == E_OK) {
2110 PyTokenizer_Get(tok, &p_start, &p_end);
2111 }
2112 fclose(fp);
2113 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01002114 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Pablo Galindo261a4522021-03-28 23:48:05 +01002115 if (encoding) {
Hansraj Das69f37bc2019-08-15 21:49:07 +05302116 strcpy(encoding, tok->encoding);
Pablo Galindo261a4522021-03-28 23:48:05 +01002117 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002118 }
2119 PyTokenizer_Free(tok);
2120 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002121}
Thomas Wouters89d996e2007-09-08 17:39:28 +00002122
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002123char *
2124PyTokenizer_FindEncoding(int fd)
2125{
2126 return PyTokenizer_FindEncodingFilename(fd, NULL);
2127}
2128
Guido van Rossum408027e1996-12-30 16:17:54 +00002129#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002130
2131void
Thomas Wouters23c9e002000-07-22 19:20:54 +00002132tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002133{
Miss Islington (bot)038f4522021-10-27 14:45:43 -07002134 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002135 if (type == NAME || type == NUMBER || type == STRING || type == OP)
Miss Islington (bot)038f4522021-10-27 14:45:43 -07002136 fprintf(stderr, "(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002137}
2138
2139#endif