blob: a076d625bbf515413cc7025232b1e53892d8a190 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
/* Alternate tab spacing.
   NOTE(review): presumably the tab width used for the alternate indentation
   stack (tok->altindstack) -- no use is visible in this part of the file,
   confirm against the indentation code. */
#define ALTTABSIZE 1
20
/* Non-zero if C may begin an identifier: an ASCII letter, '_' or any value
   >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesised so that expression arguments expand
   correctly; it is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   '_' or any value >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesised so that expression arguments expand
   correctly; it is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034
/* Tab stop width stored in tok->tabsize by tok_new().
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);


/* Prefix that introduces a type comment.
   Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
47
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000051tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052{
Victor Stinner00d7abd2020-12-01 09:56:42 +010053 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000054 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060057 tok->buf = tok->cur = tok->inp = NULL;
Pablo Galindocd8dcbc2021-03-14 04:38:40 +010058 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060061 tok->start = NULL;
62 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Guido van Rossum495da292019-03-07 12:38:08 -080084 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -070088 tok->interactive_underflow = IUNDERFLOW_NORMAL;
Guido van Rossum495da292019-03-07 12:38:08 -080089
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000091}
92
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000093static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070094new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095{
Victor Stinner00d7abd2020-12-01 09:56:42 +010096 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 memcpy(result, s, len);
102 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000104}
105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100111 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   Only the first 12 characters of S are examined; they are compared after
   folding ASCII upper-case letters to lower case and '_' to '-'.
   Returns the literal "utf-8" or "iso-8859-1" when S (or S followed by a
   '-' suffix) names one of those encodings, otherwise returns S itself, so
   callers can detect "no change" by pointer comparison.

   Case folding is done by hand rather than with tolower(): tolower() is
   undefined for negative char values and depends on the current locale. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        char c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
Pablo Galindo261a4522021-03-28 23:48:05 +0100166 if (memcmp(t, "coding", 6) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
Pablo Galindo261a4522021-03-28 23:48:05 +0100173 } while (t[0] == ' ' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100187 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200210 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 /* It's a continuation line, so it can't be a coding spec. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100212 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100215 if (!get_coding_spec(line, &cs, size, tok)) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700216 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100217 }
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100226 tok->decoding_state = STATE_NORMAL;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200227 break;
228 }
229 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200231 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100232 tok->decoding_state = STATE_NORMAL;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (tok->encoding == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100234 assert(tok->decoding_readline == NULL);
235 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236 error_ret(tok);
237 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238 PyMem_Free(cs);
239 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 } else { /* then, compare cs with BOM */
Pablo Galindo261a4522021-03-28 23:48:05 +0100243 if (strcmp(tok->encoding, cs) != 0) {
244 error_ret(tok);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 PyErr_Format(PyExc_SyntaxError,
246 "encoding problem: %s with BOM", cs);
Pablo Galindo261a4522021-03-28 23:48:05 +0100247 PyMem_Free(cs);
248 return 0;
249 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100250 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100252 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000253}
254
255/* See whether the file starts with a BOM. If it does,
256 invoke the set_readline function with the new encoding.
257 Return 1 on success, 0 on failure. */
258
259static int
260check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 void unget_char(int, struct tok_state *),
262 int set_readline(struct tok_state *, const char *),
263 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 int ch1, ch2, ch3;
266 ch1 = get_char(tok);
Pablo Galindo261a4522021-03-28 23:48:05 +0100267 tok->decoding_state = STATE_SEEK_CODING;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (ch1 == EOF) {
269 return 1;
270 } else if (ch1 == 0xEF) {
271 ch2 = get_char(tok);
272 if (ch2 != 0xBB) {
273 unget_char(ch2, tok);
274 unget_char(ch1, tok);
275 return 1;
276 }
277 ch3 = get_char(tok);
278 if (ch3 != 0xBF) {
279 unget_char(ch3, tok);
280 unget_char(ch2, tok);
281 unget_char(ch1, tok);
282 return 1;
283 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000284#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 /* Disable support for UTF-16 BOMs until a decision
286 is made whether this needs to be supported. */
287 } else if (ch1 == 0xFE) {
288 ch2 = get_char(tok);
289 if (ch2 != 0xFF) {
290 unget_char(ch2, tok);
291 unget_char(ch1, tok);
292 return 1;
293 }
294 if (!set_readline(tok, "utf-16-be"))
295 return 0;
296 tok->decoding_state = STATE_NORMAL;
297 } else if (ch1 == 0xFF) {
298 ch2 = get_char(tok);
299 if (ch2 != 0xFE) {
300 unget_char(ch2, tok);
301 unget_char(ch1, tok);
302 return 1;
303 }
304 if (!set_readline(tok, "utf-16-le"))
305 return 0;
306 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 } else {
309 unget_char(ch1, tok);
310 return 1;
311 }
312 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100313 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700314 tok->encoding = new_string("utf-8", 5, tok);
315 if (!tok->encoding)
316 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 /* No need to set_readline: input is already utf-8 */
318 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319}
320
Pablo Galindo261a4522021-03-28 23:48:05 +0100321static int
322tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348}
349
350
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: the previous call to tok_readline_recode did not
           have enough room (in the s buffer) to copy the entire contents of
           the line read by tok->decoding_readline.  tok->decoding_buffer
           holds the overflow.  In this case, tok_readline_recode is called
           in a loop (with an expanded buffer) until the buffer ends with a
           '\n' (or until the end of the file is reached): see tok_nextc and
           its calls to tok_reserve_buf.
*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
Pablo Galindo261a4522021-03-28 23:48:05 +0100366static int
367tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368{
Pablo Galindo261a4522021-03-28 23:48:05 +0100369 Py_ssize_t cur = tok->cur - tok->buf;
370 Py_ssize_t oldsize = tok->inp - tok->buf;
371 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372 if (newsize > tok->end - tok->buf) {
373 char *newbuf = tok->buf;
374 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700375 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
Pablo Galindo261a4522021-03-28 23:48:05 +0100377 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378 if (newbuf == NULL) {
379 tok->done = E_NOMEM;
380 return 0;
381 }
382 tok->buf = newbuf;
383 tok->cur = tok->buf + cur;
384 tok->inp = tok->buf + oldsize;
385 tok->end = tok->buf + newsize;
386 tok->start = start < 0 ? NULL : tok->buf + start;
Miss Islington (bot)d03f3422021-06-12 13:27:02 -0700387 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
Pablo Galindo261a4522021-03-28 23:48:05 +0100389 }
390 return 1;
391}
392
393static int
394tok_readline_recode(struct tok_state *tok) {
395 PyObject *line;
396 const char *buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 Py_ssize_t buflen;
Pablo Galindo261a4522021-03-28 23:48:05 +0100398 line = tok->decoding_buffer;
399 if (line == NULL) {
400 line = PyObject_CallNoArgs(tok->decoding_readline);
401 if (line == NULL) {
402 error_ret(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 goto error;
404 }
405 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100406 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 tok->decoding_buffer = NULL;
Pablo Galindo261a4522021-03-28 23:48:05 +0100408 }
409 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410 if (buf == NULL) {
411 error_ret(tok);
412 goto error;
413 }
414 if (!tok_reserve_buf(tok, buflen + 1)) {
415 goto error;
416 }
417 memcpy(tok->inp, buf, buflen);
418 tok->inp += buflen;
419 *tok->inp = '\0';
420 if (tok->fp_interactive &&
421 tok_concatenate_interactive_new_line(tok, buf) == -1) {
422 goto error;
423 }
424 Py_DECREF(line);
425 return 1;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426error:
Pablo Galindo261a4522021-03-28 23:48:05 +0100427 Py_XDECREF(line);
428 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429}
430
431/* Set the readline function for TOK to a StreamReader's
432 readline function. The StreamReader is named ENC.
433
434 This function is called from check_bom and check_coding_spec.
435
436 ENC is usually identical to the future value of tok->encoding,
437 except for the (currently unsupported) case of UTF-16.
438
439 Return 1 on success, 0 on failure. */
440
441static int
442fp_setreadl(struct tok_state *tok, const char* enc)
443{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200445 _Py_IDENTIFIER(open);
446 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000447 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200448 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449
Victor Stinner22a351a2010-10-14 12:04:34 +0000450 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200451 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100452 * position of tok->fp. If tok->fp was opened in text mode on Windows,
453 * its file position counts CRLF as one char and can't be directly mapped
454 * to the file offset for fd. Instead we step back one byte and read to
455 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200456 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100457 if (pos == -1 ||
458 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000459 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700460 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000461 }
462
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700463 io = PyImport_ImportModuleNoBlock("io");
464 if (io == NULL)
465 return 0;
466
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200467 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000468 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700469 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700471 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200473 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700474 Py_DECREF(stream);
475 if (readline == NULL)
476 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300477 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700478
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100479 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100480 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700481 if (bufobj == NULL)
482 return 0;
483 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100484 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700486 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487}
488
489/* Fetch the next byte from TOK. */
490
491static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493}
494
495/* Unfetch the last byte back into TOK. */
496
497static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499}
500
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   Only the structure is checked: a lead byte followed by the right number
   of continuation bytes (0x80..0xBF).  Continuation bytes are examined in
   forward order, so for a sequence cut short by the string's NUL terminator
   the scan stops at the terminator instead of reading the bytes after it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
528
Pablo Galindo261a4522021-03-28 23:48:05 +0100529static int
530ensure_utf8(char *line, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 int badchar = 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000538 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000539 }
540 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
Pablo Galindo261a4522021-03-28 23:48:05 +0100543 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200544 PyErr_Format(PyExc_SyntaxError,
Pablo Galindo261a4522021-03-28 23:48:05 +0100545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
Miss Islington (bot)f7f1c262021-07-30 07:25:28 -0700548 "see https://python.org/dev/peps/pep-0263/ for details",
Pablo Galindo261a4522021-03-28 23:48:05 +0100549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100552 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Fetch a byte from TOK, using the string buffer. */
556
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000557static int
558buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562/* Unfetch a byte from TOK, using the string buffer. */
563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000564static void
565buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 tok->str--;
567 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568}
569
570/* Set the readline function for TOK to ENC. For the string-based
571 tokenizer, this means to just record the encoding. */
572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000573static int
574buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 tok->enc = enc;
576 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577}
578
579/* Return a UTF-8 encoding Python string object from the
580 C byte string STR, which is encoded with ENC. */
581
582static PyObject *
583translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000593
594static char *
595translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 char *buf, *current;
599 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100600 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000629 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100631 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000632 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100633 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000634 }
635 buf = result;
636 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000638}
639
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640/* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
Andy Lester384f3c52020-02-27 20:44:52 -0600644static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000645decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600648 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 if (tok->enc != NULL) {
662 utf8 = translate_into_utf8(str, tok->enc);
663 if (utf8 == NULL)
664 return error_ret(tok);
665 str = PyBytes_AsString(utf8);
666 }
667 for (s = str;; s++) {
668 if (*s == '\0') break;
669 else if (*s == '\n') {
670 assert(lineno < 2);
671 newl[lineno] = s;
672 lineno++;
673 if (lineno == 2) break;
674 }
675 }
676 tok->enc = NULL;
677 /* need to check line 1 and 2 separately since check_coding_spec
678 assumes a single line as input */
679 if (newl[0]) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100680 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681 return NULL;
682 }
683 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685 tok, buf_setreadl))
Pablo Galindo261a4522021-03-28 23:48:05 +0100686 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 }
688 }
689 if (tok->enc != NULL) {
690 assert(utf8 == NULL);
691 utf8 = translate_into_utf8(str, tok->enc);
692 if (utf8 == NULL)
693 return error_ret(tok);
694 str = PyBytes_AS_STRING(utf8);
695 }
696 assert(tok->decoding_buffer == NULL);
697 tok->decoding_buffer = utf8; /* CAUTION */
698 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699}
700
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701/* Set up tokenizer for string */
702
703struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600707 char *decoded;
708
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (tok == NULL)
710 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyTokenizer_Free(tok);
714 return NULL;
715 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716
Andy Lester384f3c52020-02-27 20:44:52 -0600717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720}
721
Pablo Galindo261a4522021-03-28 23:48:05 +0100722/* Set up tokenizer for UTF-8 string */
723
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000724struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000725PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600728 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 if (tok == NULL)
730 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyTokenizer_Free(tok);
734 return NULL;
735 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100736 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600738 tok->str = translated;
Pablo Galindo261a4522021-03-28 23:48:05 +0100739 tok->encoding = new_string("utf-8", 5, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744
Andy Lester384f3c52020-02-27 20:44:52 -0600745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748}
749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300753PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100771 tok->encoding = new_string(enc, strlen(enc), tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779}
780
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781/* Free a tok_state structure */
782
783void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000784PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785{
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100786 if (tok->encoding != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100787 PyMem_Free(tok->encoding);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100788 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200791 Py_XDECREF(tok->filename);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100792 if (tok->fp != NULL && tok->buf != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100793 PyMem_Free(tok->buf);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100794 }
795 if (tok->input) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100796 PyMem_Free(tok->input);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100801 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
Pablo Galindo261a4522021-03-28 23:48:05 +0100804static int
805tok_readline_raw(struct tok_state *tok)
806{
807 do {
808 if (!tok_reserve_buf(tok, BUFSIZ)) {
809 return 0;
810 }
811 char *line = Py_UniversalNewlineFgets(tok->inp,
812 (int)(tok->end - tok->inp),
813 tok->fp, NULL);
814 if (line == NULL) {
815 return 1;
816 }
817 if (tok->fp_interactive &&
818 tok_concatenate_interactive_new_line(tok, line) == -1) {
819 return 0;
820 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100821 if (*tok->inp == '\0') {
822 return 0;
823 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100824 tok->inp = strchr(tok->inp, '\0');
825 } while (tok->inp[-1] != '\n');
826 return 1;
827}
828
829static int
830tok_underflow_string(struct tok_state *tok) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL) {
833 end++;
834 }
835 else {
836 end = strchr(tok->inp, '\0');
837 if (end == tok->inp) {
838 tok->done = E_EOF;
839 return 0;
840 }
841 }
842 if (tok->start == NULL) {
843 tok->buf = tok->cur;
844 }
845 tok->line_start = tok->cur;
846 tok->lineno++;
847 tok->inp = end;
848 return 1;
849}
850
851static int
852tok_underflow_interactive(struct tok_state *tok) {
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -0700853 if (tok->interactive_underflow == IUNDERFLOW_STOP) {
854 tok->done = E_INTERACT_STOP;
855 return 1;
856 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100857 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
858 if (newtok != NULL) {
859 char *translated = translate_newlines(newtok, 0, tok);
860 PyMem_Free(newtok);
861 if (translated == NULL) {
862 return 0;
863 }
864 newtok = translated;
865 }
866 if (tok->encoding && newtok && *newtok) {
867 /* Recode to UTF-8 */
868 Py_ssize_t buflen;
869 const char* buf;
870 PyObject *u = translate_into_utf8(newtok, tok->encoding);
871 PyMem_Free(newtok);
872 if (u == NULL) {
873 tok->done = E_DECODE;
874 return 0;
875 }
876 buflen = PyBytes_GET_SIZE(u);
877 buf = PyBytes_AS_STRING(u);
878 newtok = PyMem_Malloc(buflen+1);
879 if (newtok == NULL) {
880 Py_DECREF(u);
881 tok->done = E_NOMEM;
882 return 0;
883 }
884 strcpy(newtok, buf);
885 Py_DECREF(u);
886 }
887 if (tok->fp_interactive &&
888 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
889 PyMem_Free(newtok);
890 return 0;
891 }
892 if (tok->nextprompt != NULL) {
893 tok->prompt = tok->nextprompt;
894 }
895 if (newtok == NULL) {
896 tok->done = E_INTR;
897 }
898 else if (*newtok == '\0') {
899 PyMem_Free(newtok);
900 tok->done = E_EOF;
901 }
902 else if (tok->start != NULL) {
903 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
904 size_t size = strlen(newtok);
905 tok->lineno++;
906 if (!tok_reserve_buf(tok, size + 1)) {
907 PyMem_Free(tok->buf);
908 tok->buf = NULL;
909 PyMem_Free(newtok);
910 return 0;
911 }
912 memcpy(tok->cur, newtok, size + 1);
913 PyMem_Free(newtok);
914 tok->inp += size;
915 tok->multi_line_start = tok->buf + cur_multi_line_start;
916 }
917 else {
918 tok->lineno++;
919 PyMem_Free(tok->buf);
920 tok->buf = newtok;
921 tok->cur = tok->buf;
922 tok->line_start = tok->buf;
923 tok->inp = strchr(tok->buf, '\0');
924 tok->end = tok->inp + 1;
925 }
926 if (tok->done != E_OK) {
927 if (tok->prompt != NULL) {
928 PySys_WriteStderr("\n");
929 }
930 return 0;
931 }
932 return 1;
933}
934
935static int
936tok_underflow_file(struct tok_state *tok) {
937 if (tok->start == NULL) {
938 tok->cur = tok->inp = tok->buf;
939 }
940 if (tok->decoding_state == STATE_INIT) {
941 /* We have not yet determined the encoding.
942 If an encoding is found, use the file-pointer
943 reader functions from now on. */
944 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945 error_ret(tok);
946 return 0;
947 }
948 assert(tok->decoding_state != STATE_INIT);
949 }
950 /* Read until '\n' or EOF */
951 if (tok->decoding_readline != NULL) {
952 /* We already have a codec associated with this input. */
953 if (!tok_readline_recode(tok)) {
954 return 0;
955 }
956 }
957 else {
958 /* We want a 'raw' read. */
959 if (!tok_readline_raw(tok)) {
960 return 0;
961 }
962 }
963 if (tok->inp == tok->cur) {
964 tok->done = E_EOF;
965 return 0;
966 }
967 if (tok->inp[-1] != '\n') {
968 /* Last line does not end in \n, fake one */
969 *tok->inp++ = '\n';
970 *tok->inp = '\0';
971 }
972
973 tok->lineno++;
974 if (tok->decoding_state != STATE_NORMAL) {
975 if (tok->lineno > 2) {
976 tok->decoding_state = STATE_NORMAL;
977 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100978 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Pablo Galindo261a4522021-03-28 23:48:05 +0100979 tok, fp_setreadl))
980 {
981 return 0;
982 }
983 }
984 /* The default encoding is UTF-8, so make sure we don't have any
985 non-UTF-8 sequences in it. */
986 if (!tok->encoding
987 && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
988 if (!ensure_utf8(tok->cur, tok)) {
989 error_ret(tok);
990 return 0;
991 }
992 }
993 assert(tok->done == E_OK);
994 return tok->done == E_OK;
995}
996
/* Debugging helper: write the `size` bytes starting at `s` to `f` as a
   double-quoted string with C-style escapes, or the bare word NULL when
   `s` is NULL.

   Newline, carriage return, tab, form feed, both quote characters and the
   backslash get a two-character escape; printable ASCII (0x20..0x7e) is
   written as is; every other byte, including DEL (0x7f), is written as
   \xNN.  Escaping the backslash keeps the output unambiguous: the input
   bytes '\\' 'n' are no longer rendered the same way as a newline. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, so it is excluded
                   from the range that is printed verbatim. */
                if (0x20 <= c && c < 0x7f) {
                    putc(c, f);
                }
                else {
                    fprintf(f, "\\x%02x", c);
                }
                break;
        }
    }
    putc('"', f);
}
1023
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001024/* Get next char, updating state; error code goes into tok->done */
1025
1026static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001027tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001028{
Pablo Galindo261a4522021-03-28 23:48:05 +01001029 int rc;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 for (;;) {
1031 if (tok->cur != tok->inp) {
1032 return Py_CHARMASK(*tok->cur++); /* Fast path */
1033 }
1034 if (tok->done != E_OK)
1035 return EOF;
1036 if (tok->fp == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +01001037 rc = tok_underflow_string(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001039 else if (tok->prompt != NULL) {
1040 rc = tok_underflow_interactive(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 }
1042 else {
Pablo Galindo261a4522021-03-28 23:48:05 +01001043 rc = tok_underflow_file(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001044 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001045 if (Py_DebugFlag) {
1046 printf("line[%d] = ", tok->lineno);
1047 print_escape(stdout, tok->cur, tok->inp - tok->cur);
1048 printf(" tok->done = %d\n", tok->done);
1049 }
1050 if (!rc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 tok->cur = tok->inp;
1052 return EOF;
1053 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001054 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001056 Py_UNREACHABLE();
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057}
1058
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001059/* Back-up one character */
1060
1061static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001062tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001063{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001065 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001066 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001067 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001068 if ((int)(unsigned char)*tok->cur != c) {
1069 Py_FatalError("tok_backup: wrong character");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001070 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001072}
1073
Guido van Rossum926f13a1998-04-09 21:38:06 +00001074static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001075_syntaxerror_range(struct tok_state *tok, const char *format,
1076 int col_offset, int end_col_offset,
1077 va_list vargs)
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001078{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001079 PyObject *errmsg, *errtext, *args;
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001080 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001081 if (!errmsg) {
1082 goto error;
1083 }
1084
1085 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1086 "replace");
1087 if (!errtext) {
1088 goto error;
1089 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001090
1091 if (col_offset == -1) {
1092 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1093 }
1094 if (end_col_offset == -1) {
1095 end_col_offset = col_offset;
1096 }
1097
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001098 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1099 if (line_len != tok->cur - tok->line_start) {
1100 Py_DECREF(errtext);
1101 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1102 "replace");
1103 }
1104 if (!errtext) {
1105 goto error;
1106 }
1107
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001108 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1109 col_offset, errtext, tok->lineno, end_col_offset);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001110 if (args) {
1111 PyErr_SetObject(PyExc_SyntaxError, args);
1112 Py_DECREF(args);
1113 }
1114
1115error:
1116 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001117 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001118 return ERRORTOKEN;
1119}
1120
1121static int
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001122syntaxerror(struct tok_state *tok, const char *format, ...)
1123{
1124 va_list vargs;
1125#ifdef HAVE_STDARG_PROTOTYPES
1126 va_start(vargs, format);
1127#else
1128 va_start(vargs);
1129#endif
1130 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1131 va_end(vargs);
1132 return ret;
1133}
1134
1135static int
1136syntaxerror_known_range(struct tok_state *tok,
1137 int col_offset, int end_col_offset,
1138 const char *format, ...)
1139{
1140 va_list vargs;
1141#ifdef HAVE_STDARG_PROTOTYPES
1142 va_start(vargs, format);
1143#else
1144 va_start(vargs);
1145#endif
1146 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1147 va_end(vargs);
1148 return ret;
1149}
1150
1151
1152
1153static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001154indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001156 tok->done = E_TABSPACE;
1157 tok->cur = tok->inp;
1158 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159}
1160
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001161static int
1162parser_warn(struct tok_state *tok, const char *format, ...)
1163{
1164 PyObject *errmsg;
1165 va_list vargs;
1166#ifdef HAVE_STDARG_PROTOTYPES
1167 va_start(vargs, format);
1168#else
1169 va_start(vargs);
1170#endif
1171 errmsg = PyUnicode_FromFormatV(format, vargs);
1172 va_end(vargs);
1173 if (!errmsg) {
1174 goto error;
1175 }
1176
1177 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1178 tok->lineno, NULL, NULL) < 0) {
1179 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1180 /* Replace the DeprecationWarning exception with a SyntaxError
1181 to get a more accurate error report */
1182 PyErr_Clear();
1183 syntaxerror(tok, "%U", errmsg);
1184 }
1185 goto error;
1186 }
1187 Py_DECREF(errmsg);
1188 return 0;
1189
1190error:
1191 Py_XDECREF(errmsg);
1192 tok->done = E_ERROR;
1193 return -1;
1194}
1195
/* Check, without consuming input, whether the upcoming characters spell
   `test` and are followed by something that cannot continue an
   identifier.

   Returns 1 on such a match and 0 otherwise.  Every character read is
   pushed back, so the tokenizer position is unchanged on return. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *pos = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with `test`. */
    while (*pos != '\0' && c == *pos) {
        pos++;
        c = tok_nextc(tok);
    }

    /* A match needs all of `test` plus a terminating non-identifier char. */
    int matched = 0;
    if (*pos == '\0') {
        matched = !is_potential_identifier_char(c);
    }

    /* Undo: first the look-ahead character, then the matched prefix. */
    tok_backup(tok, c);
    while (pos != test) {
        tok_backup(tok, *--pos);
    }
    return matched;
}
1218
/* Check the character `c` that immediately follows a numeric literal.

   `kind` names the literal type and is interpolated into the
   "invalid %s literal" message.  Returns 1 if tokenizing may continue
   (possibly after a deprecation warning) and 0 if an error was raised. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is", "not" and "or".
     * It allows to gradually deprecate existing valid code without adding
     * warning before error in most cases of invalid numeric literal (which
     * would be confusing and break existing tests).
     * Raise a syntax error with slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * other keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        /* "if", "in" and "is": only the second letter is inspected. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'n') {
        /* "1not in x" is valid code ("1 not in x"), so it must take the
           warning path rather than the hard error below. */
        r = lookahead(tok, "ot");
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    if (r) {
        /* Push `c` back while warning, then re-read it afterwards. */
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
    }
    else if (is_potential_identifier_char(c)) {
        /* In future releases, only error will remain. */
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268/* Verify that the identifier follows PEP 3131.
1269 All identifier strings are guaranteed to be "ready" unicode objects.
1270 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001271static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001272verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001273{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001275 if (tok->decoding_erred)
1276 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001278 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001280 tok->done = E_DECODE;
1281 }
1282 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 tok->done = E_ERROR;
1284 }
1285 return 0;
1286 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001287 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1288 if (invalid < 0) {
1289 Py_DECREF(s);
1290 tok->done = E_ERROR;
1291 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001292 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001293 assert(PyUnicode_GET_LENGTH(s) > 0);
1294 if (invalid < PyUnicode_GET_LENGTH(s)) {
1295 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1296 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1297 /* Determine the offset in UTF-8 encoded input */
1298 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1299 if (s != NULL) {
1300 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1301 }
1302 if (s == NULL) {
1303 tok->done = E_ERROR;
1304 return 0;
1305 }
1306 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1307 }
1308 Py_DECREF(s);
1309 // PyUnicode_FromFormatV() does not support %X
1310 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001311 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001312 if (Py_UNICODE_ISPRINTABLE(ch)) {
1313 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1314 }
1315 else {
1316 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1317 }
1318 return 0;
1319 }
1320 Py_DECREF(s);
1321 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001322}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001323
/* Consume the rest of a run of decimal digits in which single underscores
   may separate digits.

   Returns the first character after the run (already consumed from the
   input).  If an underscore is not followed by a digit, the offending
   character is pushed back, a SyntaxError is raised and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c = tok_nextc(tok);

    for (;;) {
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1345
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346/* Get next token, after space stripping etc. */
1347
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001348static int
Andy Lester384f3c52020-02-27 20:44:52 -06001349tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001351 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001353
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001355 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 tok->start = NULL;
1357 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001358
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 /* Get indentation level */
1360 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001361 int col = 0;
1362 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001363 tok->atbol = 0;
1364 for (;;) {
1365 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001366 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001368 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001370 col = (col / tok->tabsize + 1) * tok->tabsize;
1371 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372 }
Brett Cannona721aba2016-09-09 14:57:09 -07001373 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001375 }
1376 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001378 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 }
1380 tok_backup(tok, c);
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001381 if (c == '#' || c == '\n' || c == '\\') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 /* Lines with only whitespace and/or comments
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001383 and/or a line continuation character
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 shouldn't affect the indentation and are
1385 not passed to the parser as NEWLINE tokens,
1386 except *totally* empty lines in interactive
1387 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001388 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001390 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001391 else if (tok->prompt != NULL && tok->lineno == 1) {
1392 /* In interactive mode, if the first line contains
1393 only spaces and/or a comment, let it through. */
1394 blankline = 0;
1395 col = altcol = 0;
1396 }
Brett Cannona721aba2016-09-09 14:57:09 -07001397 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001398 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001399 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 /* We can't jump back right here since we still
1401 may need to skip to the end of a comment */
1402 }
1403 if (!blankline && tok->level == 0) {
1404 if (col == tok->indstack[tok->indent]) {
1405 /* No change */
1406 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001407 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 }
1409 }
1410 else if (col > tok->indstack[tok->indent]) {
1411 /* Indent -- always one */
1412 if (tok->indent+1 >= MAXINDENT) {
1413 tok->done = E_TOODEEP;
1414 tok->cur = tok->inp;
1415 return ERRORTOKEN;
1416 }
1417 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001418 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419 }
1420 tok->pendin++;
1421 tok->indstack[++tok->indent] = col;
1422 tok->altindstack[tok->indent] = altcol;
1423 }
1424 else /* col < tok->indstack[tok->indent] */ {
1425 /* Dedent -- any number, must be consistent */
1426 while (tok->indent > 0 &&
1427 col < tok->indstack[tok->indent]) {
1428 tok->pendin--;
1429 tok->indent--;
1430 }
1431 if (col != tok->indstack[tok->indent]) {
1432 tok->done = E_DEDENT;
1433 tok->cur = tok->inp;
1434 return ERRORTOKEN;
1435 }
1436 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001437 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 }
1439 }
1440 }
1441 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001442
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 /* Return pending indents/dedents */
1446 if (tok->pendin != 0) {
1447 if (tok->pendin < 0) {
1448 tok->pendin++;
1449 return DEDENT;
1450 }
1451 else {
1452 tok->pendin--;
1453 return INDENT;
1454 }
1455 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001456
Guido van Rossum495da292019-03-07 12:38:08 -08001457 /* Peek ahead at the next character */
1458 c = tok_nextc(tok);
1459 tok_backup(tok, c);
1460 /* Check if we are closing an async function */
1461 if (tok->async_def
1462 && !blankline
1463 /* Due to some implementation artifacts of type comments,
1464 * a TYPE_COMMENT at the start of a function won't set an
1465 * indentation level and it will produce a NEWLINE after it.
1466 * To avoid spuriously ending an async function due to this,
1467 * wait until we have some non-newline char in front of us. */
1468 && c != '\n'
1469 && tok->level == 0
1470 /* There was a NEWLINE after ASYNC DEF,
1471 so we're past the signature. */
1472 && tok->async_def_nl
1473 /* Current indentation level is less than where
1474 the async function was defined */
1475 && tok->async_def_indent >= tok->indent)
1476 {
1477 tok->async_def = 0;
1478 tok->async_def_indent = 0;
1479 tok->async_def_nl = 0;
1480 }
1481
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001482 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 tok->start = NULL;
1484 /* Skip spaces */
1485 do {
1486 c = tok_nextc(tok);
1487 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001488
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001489 /* Set start of current token */
1490 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001491
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001492 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001493 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001494 const char *prefix, *p, *type_start;
1495
Brett Cannona721aba2016-09-09 14:57:09 -07001496 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001498 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001499
1500 if (tok->type_comments) {
1501 p = tok->start;
1502 prefix = type_comment_prefix;
1503 while (*prefix && p < tok->cur) {
1504 if (*prefix == ' ') {
1505 while (*p == ' ' || *p == '\t') {
1506 p++;
1507 }
1508 } else if (*prefix == *p) {
1509 p++;
1510 } else {
1511 break;
1512 }
1513
1514 prefix++;
1515 }
1516
1517 /* This is a type comment if we matched all of type_comment_prefix. */
1518 if (!*prefix) {
1519 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001520 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001521 tok_backup(tok, c); /* don't eat the newline or EOF */
1522
1523 type_start = p;
1524
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001525 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001526 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001527 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001528 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001529 && !(tok->cur > ignore_end
1530 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001531
1532 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001533 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001534 *p_end = tok->cur;
1535
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001536 /* If this type ignore is the only thing on the line, consume the newline also. */
1537 if (blankline) {
1538 tok_nextc(tok);
1539 tok->atbol = 1;
1540 }
1541 return TYPE_IGNORE;
1542 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001543 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001544 *p_end = tok->cur;
1545 return TYPE_COMMENT;
1546 }
1547 }
1548 }
Brett Cannona721aba2016-09-09 14:57:09 -07001549 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001550
Miss Islington (bot)1fb6b9e2021-05-22 15:23:26 -07001551 if (tok->done == E_INTERACT_STOP) {
1552 return ENDMARKER;
1553 }
1554
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 /* Check for EOF and errors now */
1556 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001557 if (tok->level) {
1558 return ERRORTOKEN;
1559 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001560 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1561 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001562
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 /* Identifier (most frequent token!) */
1564 nonascii = 0;
1565 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001566 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001567 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001568 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001569 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001570 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001571 /* Since this is a backwards compatibility support literal we don't
1572 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001573 else if (!(saw_b || saw_u || saw_r || saw_f)
1574 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001575 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001576 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001577 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001578 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001579 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001580 }
1581 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001582 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001583 }
1584 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001585 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001586 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001588 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001590 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 }
1592 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001593 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001595 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 c = tok_nextc(tok);
1597 }
1598 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001599 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001601 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001602
1603 *p_start = tok->start;
1604 *p_end = tok->cur;
1605
Guido van Rossum495da292019-03-07 12:38:08 -08001606 /* async/await parsing block. */
1607 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1608 /* May be an 'async' or 'await' token. For Python 3.7 or
1609 later we recognize them unconditionally. For Python
1610 3.5 or 3.6 we recognize 'async' in front of 'def', and
1611 either one inside of 'async def'. (Technically we
1612 shouldn't recognize these at all for 3.4 or earlier,
1613 but there's no *valid* Python 3.4 code that would be
1614 rejected, and async functions will be rejected in a
1615 later phase.) */
1616 if (!tok->async_hacks || tok->async_def) {
1617 /* Always recognize the keywords. */
1618 if (memcmp(tok->start, "async", 5) == 0) {
1619 return ASYNC;
1620 }
1621 if (memcmp(tok->start, "await", 5) == 0) {
1622 return AWAIT;
1623 }
1624 }
1625 else if (memcmp(tok->start, "async", 5) == 0) {
1626 /* The current token is 'async'.
1627 Look ahead one token to see if that is 'def'. */
1628
1629 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001630 const char *ahead_tok_start = NULL;
1631 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001632 int ahead_tok_kind;
1633
1634 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1635 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1636 &ahead_tok_end);
1637
1638 if (ahead_tok_kind == NAME
1639 && ahead_tok.cur - ahead_tok.start == 3
1640 && memcmp(ahead_tok.start, "def", 3) == 0)
1641 {
1642 /* The next token is going to be 'def', so instead of
1643 returning a plain NAME token, return ASYNC. */
1644 tok->async_def_indent = tok->indent;
1645 tok->async_def = 1;
1646 return ASYNC;
1647 }
1648 }
1649 }
1650
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 return NAME;
1652 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001653
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Newline */
1655 if (c == '\n') {
1656 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001657 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001659 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 *p_start = tok->start;
1661 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1662 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001663 if (tok->async_def) {
1664 /* We're somewhere inside an 'async def' function, and
1665 we've encountered a NEWLINE after its signature. */
1666 tok->async_def_nl = 1;
1667 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 return NEWLINE;
1669 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001670
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 /* Period or number starting with period? */
1672 if (c == '.') {
1673 c = tok_nextc(tok);
1674 if (isdigit(c)) {
1675 goto fraction;
1676 } else if (c == '.') {
1677 c = tok_nextc(tok);
1678 if (c == '.') {
1679 *p_start = tok->start;
1680 *p_end = tok->cur;
1681 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001682 }
1683 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 tok_backup(tok, c);
1685 }
1686 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001687 }
1688 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 tok_backup(tok, c);
1690 }
1691 *p_start = tok->start;
1692 *p_end = tok->cur;
1693 return DOT;
1694 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001695
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 /* Number */
1697 if (isdigit(c)) {
1698 if (c == '0') {
1699 /* Hex, octal or binary -- maybe. */
1700 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 /* Hex */
1703 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001705 if (c == '_') {
1706 c = tok_nextc(tok);
1707 }
1708 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001709 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001710 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001711 }
1712 do {
1713 c = tok_nextc(tok);
1714 } while (isxdigit(c));
1715 } while (c == '_');
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001716 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1717 return ERRORTOKEN;
1718 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 }
1720 else if (c == 'o' || c == 'O') {
1721 /* Octal */
1722 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001723 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001724 if (c == '_') {
1725 c = tok_nextc(tok);
1726 }
1727 if (c < '0' || c >= '8') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001728 if (isdigit(c)) {
1729 return syntaxerror(tok,
1730 "invalid digit '%c' in octal literal", c);
1731 }
1732 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001733 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001734 return syntaxerror(tok, "invalid octal literal");
1735 }
Brett Cannona721aba2016-09-09 14:57:09 -07001736 }
1737 do {
1738 c = tok_nextc(tok);
1739 } while ('0' <= c && c < '8');
1740 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001741 if (isdigit(c)) {
1742 return syntaxerror(tok,
1743 "invalid digit '%c' in octal literal", c);
1744 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001745 if (!verify_end_of_number(tok, c, "octal")) {
1746 return ERRORTOKEN;
1747 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 }
1749 else if (c == 'b' || c == 'B') {
1750 /* Binary */
1751 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001753 if (c == '_') {
1754 c = tok_nextc(tok);
1755 }
1756 if (c != '0' && c != '1') {
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001757 if (isdigit(c)) {
1758 return syntaxerror(tok,
1759 "invalid digit '%c' in binary literal", c);
1760 }
1761 else {
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001762 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001763 return syntaxerror(tok, "invalid binary literal");
1764 }
Brett Cannona721aba2016-09-09 14:57:09 -07001765 }
1766 do {
1767 c = tok_nextc(tok);
1768 } while (c == '0' || c == '1');
1769 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001770 if (isdigit(c)) {
1771 return syntaxerror(tok,
1772 "invalid digit '%c' in binary literal", c);
1773 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001774 if (!verify_end_of_number(tok, c, "binary")) {
1775 return ERRORTOKEN;
1776 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 }
1778 else {
1779 int nonzero = 0;
1780 /* maybe old-style octal; c is first char of it */
1781 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001782 while (1) {
1783 if (c == '_') {
1784 c = tok_nextc(tok);
1785 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001786 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001787 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001788 }
1789 }
1790 if (c != '0') {
1791 break;
1792 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001793 c = tok_nextc(tok);
1794 }
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001795 char* zeros_end = tok->cur;
Brett Cannona721aba2016-09-09 14:57:09 -07001796 if (isdigit(c)) {
1797 nonzero = 1;
1798 c = tok_decimal_tail(tok);
1799 if (c == 0) {
1800 return ERRORTOKEN;
1801 }
1802 }
1803 if (c == '.') {
1804 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001805 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001806 }
1807 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001808 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001809 }
1810 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001812 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001813 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001814 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001815 tok_backup(tok, c);
Miss Islington (bot)2a722d42021-07-09 17:47:33 -07001816 return syntaxerror_known_range(
1817 tok, (int)(tok->start + 1 - tok->line_start),
1818 (int)(zeros_end - tok->line_start),
1819 "leading zeros in decimal integer "
1820 "literals are not permitted; "
1821 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001822 }
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001823 if (!verify_end_of_number(tok, c, "decimal")) {
1824 return ERRORTOKEN;
1825 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001826 }
1827 }
1828 else {
1829 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001830 c = tok_decimal_tail(tok);
1831 if (c == 0) {
1832 return ERRORTOKEN;
1833 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001834 {
1835 /* Accept floating point numbers. */
1836 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001837 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001838 fraction:
1839 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001840 if (isdigit(c)) {
1841 c = tok_decimal_tail(tok);
1842 if (c == 0) {
1843 return ERRORTOKEN;
1844 }
1845 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001846 }
1847 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001848 int e;
1849 exponent:
1850 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 /* Exponent part */
1852 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001853 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001854 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001855 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001856 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001857 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001858 }
1859 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001860 tok_backup(tok, c);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001861 if (!verify_end_of_number(tok, e, "decimal")) {
1862 return ERRORTOKEN;
1863 }
Benjamin Petersonc4161622014-06-07 12:36:39 -07001864 tok_backup(tok, e);
1865 *p_start = tok->start;
1866 *p_end = tok->cur;
1867 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001868 }
Brett Cannona721aba2016-09-09 14:57:09 -07001869 c = tok_decimal_tail(tok);
1870 if (c == 0) {
1871 return ERRORTOKEN;
1872 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001873 }
Brett Cannona721aba2016-09-09 14:57:09 -07001874 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001875 /* Imaginary part */
1876 imaginary:
1877 c = tok_nextc(tok);
Miss Islington (bot)eeefa7f2021-06-08 16:52:23 -07001878 if (!verify_end_of_number(tok, c, "imaginary")) {
1879 return ERRORTOKEN;
1880 }
1881 }
1882 else if (!verify_end_of_number(tok, c, "decimal")) {
1883 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001884 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001885 }
1886 }
1887 tok_backup(tok, c);
1888 *p_start = tok->start;
1889 *p_end = tok->cur;
1890 return NUMBER;
1891 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001892
1893 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001894 /* String */
1895 if (c == '\'' || c == '"') {
1896 int quote = c;
1897 int quote_size = 1; /* 1 or 3 */
1898 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001899
Anthony Sottile995d9b92019-01-12 20:05:13 -08001900 /* Nodes of type STRING, especially multi line strings
1901 must be handled differently in order to get both
1902 the starting line number and the column offset right.
1903 (cf. issue 16806) */
1904 tok->first_lineno = tok->lineno;
1905 tok->multi_line_start = tok->line_start;
1906
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001907 /* Find the quote size and start of string */
1908 c = tok_nextc(tok);
1909 if (c == quote) {
1910 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001911 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001912 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001913 }
1914 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001915 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001916 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001917 }
Brett Cannona721aba2016-09-09 14:57:09 -07001918 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001919 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001920 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001921
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001922 /* Get rest of string */
1923 while (end_quote_size != quote_size) {
1924 c = tok_nextc(tok);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001925 if (c == EOF || (quote_size == 1 && c == '\n')) {
Miss Islington (bot)d03f3422021-06-12 13:27:02 -07001926 assert(tok->multi_line_start != NULL);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001927 // shift the tok_state's location into
1928 // the start of string, and report the error
1929 // from the initial quote character
1930 tok->cur = (char *)tok->start;
1931 tok->cur++;
1932 tok->line_start = tok->multi_line_start;
1933 int start = tok->lineno;
1934 tok->lineno = tok->first_lineno;
1935
Brett Cannona721aba2016-09-09 14:57:09 -07001936 if (quote_size == 3) {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001937 return syntaxerror(tok,
1938 "unterminated triple-quoted string literal"
1939 " (detected at line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001940 }
1941 else {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001942 return syntaxerror(tok,
1943 "unterminated string literal (detected at"
1944 " line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001945 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001946 }
Brett Cannona721aba2016-09-09 14:57:09 -07001947 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001948 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001949 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001950 else {
1951 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001952 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001953 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001954 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001955 }
1956 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001957
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001958 *p_start = tok->start;
1959 *p_end = tok->cur;
1960 return STRING;
1961 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001962
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001963 /* Line continuation */
1964 if (c == '\\') {
1965 c = tok_nextc(tok);
1966 if (c != '\n') {
1967 tok->done = E_LINECONT;
1968 tok->cur = tok->inp;
1969 return ERRORTOKEN;
1970 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001971 c = tok_nextc(tok);
1972 if (c == EOF) {
1973 tok->done = E_EOF;
1974 tok->cur = tok->inp;
1975 return ERRORTOKEN;
1976 } else {
1977 tok_backup(tok, c);
1978 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001979 tok->cont_line = 1;
1980 goto again; /* Read next line */
1981 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001982
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001983 /* Check for two-character token */
1984 {
1985 int c2 = tok_nextc(tok);
1986 int token = PyToken_TwoChars(c, c2);
1987 if (token != OP) {
1988 int c3 = tok_nextc(tok);
1989 int token3 = PyToken_ThreeChars(c, c2, c3);
1990 if (token3 != OP) {
1991 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001992 }
1993 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001994 tok_backup(tok, c3);
1995 }
1996 *p_start = tok->start;
1997 *p_end = tok->cur;
1998 return token;
1999 }
2000 tok_backup(tok, c2);
2001 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002003 /* Keep track of parentheses nesting level */
2004 switch (c) {
2005 case '(':
2006 case '[':
2007 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002008 if (tok->level >= MAXLEVEL) {
2009 return syntaxerror(tok, "too many nested parentheses");
2010 }
2011 tok->parenstack[tok->level] = c;
2012 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindoae7d3cd92021-01-20 12:53:52 +00002013 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002014 tok->level++;
2015 break;
2016 case ')':
2017 case ']':
2018 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002019 if (!tok->level) {
2020 return syntaxerror(tok, "unmatched '%c'", c);
2021 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002022 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02002023 int opening = tok->parenstack[tok->level];
2024 if (!((opening == '(' && c == ')') ||
2025 (opening == '[' && c == ']') ||
2026 (opening == '{' && c == '}')))
2027 {
2028 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2029 return syntaxerror(tok,
2030 "closing parenthesis '%c' does not match "
2031 "opening parenthesis '%c' on line %d",
2032 c, opening, tok->parenlinenostack[tok->level]);
2033 }
2034 else {
2035 return syntaxerror(tok,
2036 "closing parenthesis '%c' does not match "
2037 "opening parenthesis '%c'",
2038 c, opening);
2039 }
2040 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002041 break;
2042 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002043
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002044 /* Punctuation character */
2045 *p_start = tok->start;
2046 *p_end = tok->cur;
2047 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002048}
2049
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002050int
Andy Lester384f3c52020-02-27 20:44:52 -06002051PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002052{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002053 int result = tok_get(tok, p_start, p_end);
2054 if (tok->decoding_erred) {
2055 result = ERRORTOKEN;
2056 tok->done = E_DECODE;
2057 }
2058 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00002059}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002060
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002061/* Get the encoding of a Python file. Check for the coding cookie and check if
2062 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002063
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002064 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2065 encoding in the first or second line of the file (in which case the encoding
2066 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00002067
Victor Stinner00d7abd2020-12-01 09:56:42 +01002068 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002069 by the caller. */
2070
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002071char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002072PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00002073{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002074 struct tok_state *tok;
2075 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06002076 const char *p_start = NULL;
2077 const char *p_end = NULL;
2078 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002079
Victor Stinnerdaf45552013-08-28 00:53:59 +02002080 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002081 if (fd < 0) {
2082 return NULL;
2083 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02002084
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002085 fp = fdopen(fd, "r");
2086 if (fp == NULL) {
2087 return NULL;
2088 }
2089 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2090 if (tok == NULL) {
2091 fclose(fp);
2092 return NULL;
2093 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002094 if (filename != NULL) {
2095 Py_INCREF(filename);
2096 tok->filename = filename;
2097 }
2098 else {
2099 tok->filename = PyUnicode_FromString("<string>");
2100 if (tok->filename == NULL) {
2101 fclose(fp);
2102 PyTokenizer_Free(tok);
2103 return encoding;
2104 }
2105 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002106 while (tok->lineno < 2 && tok->done == E_OK) {
2107 PyTokenizer_Get(tok, &p_start, &p_end);
2108 }
2109 fclose(fp);
2110 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01002111 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Pablo Galindo261a4522021-03-28 23:48:05 +01002112 if (encoding) {
Hansraj Das69f37bc2019-08-15 21:49:07 +05302113 strcpy(encoding, tok->encoding);
Pablo Galindo261a4522021-03-28 23:48:05 +01002114 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002115 }
2116 PyTokenizer_Free(tok);
2117 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00002118}
Thomas Wouters89d996e2007-09-08 17:39:28 +00002119
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02002120char *
2121PyTokenizer_FindEncoding(int fd)
2122{
2123 return PyTokenizer_FindEncodingFilename(fd, NULL);
2124}
2125
Guido van Rossum408027e1996-12-30 16:17:54 +00002126#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002127
2128void
Thomas Wouters23c9e002000-07-22 19:20:54 +00002129tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002130{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002131 printf("%s", _PyParser_TokenNames[type]);
2132 if (type == NAME || type == NUMBER || type == STRING || type == OP)
2133 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002134}
2135
2136#endif