blob: ad32293d70b7855c5a8ae28e58936381b7ec5348 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080018/* Alternate tab spacing */
19#define ALTTABSIZE 1
20
/* Non-zero if C may start an identifier: an ASCII letter, an underscore,
   or any value >= 128 (i.e. non-ASCII).
   The argument is fully parenthesized so expression arguments parse as
   intended.  C is evaluated several times, so it must have no side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128 (i.e. non-ASCII).
   The argument is fully parenthesized so expression arguments parse as
   intended.  C is evaluated several times, so it must have no side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Guido van Rossumf4b1a641994-08-29 12:43:07 +000034
Guido van Rossum4fe87291992-02-26 15:24:44 +000035/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000036#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037
Guido van Rossum3f5da241990-12-20 15:06:42 +000038/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000039static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000042
Brett Cannond5ec98c2007-10-20 02:54:14 +000043
Guido van Rossumdcfcd142019-01-31 03:40:27 -080044/* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46static const char* type_comment_prefix = "# type: ";
47
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000051tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052{
Victor Stinner00d7abd2020-12-01 09:56:42 +010053 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000054 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060057 tok->buf = tok->cur = tok->inp = NULL;
Pablo Galindocd8dcbc2021-03-14 04:38:40 +010058 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060061 tok->start = NULL;
62 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020080 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080083 tok->type_comments = 0;
Guido van Rossum495da292019-03-07 12:38:08 -080084 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
88
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000090}
91
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000092static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070093new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000094{
Victor Stinner00d7abd2020-12-01 09:56:42 +010095 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070096 if (!result) {
97 tok->done = E_NOMEM;
98 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700100 memcpy(result, s, len);
101 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000102 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000103}
104
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000105static char *
106error_ret(struct tok_state *tok) /* XXX */
107{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000108 tok->decoding_erred = 1;
109 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100110 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600111 tok->buf = tok->cur = tok->inp = NULL;
112 tok->start = NULL;
113 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200114 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000116}
117
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000118
/* Map the common spellings of the utf-8 and latin-1 encoding names to one
   canonical name.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (or starts with, followed by '-') one of those
   encodings, otherwise returns S itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require an unsigned char value (or EOF);
           a plain char may be negative for bytes >= 0x80. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
147
148/* Return the coding spec in S, or NULL if none is found. */
149
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150static int
151get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700154 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 /* Coding spec must be in a comment, and that comment must be
156 * the only statement on the source code line. */
157 for (i = 0; i < size - 6; i++) {
158 if (s[i] == '#')
159 break;
160 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700161 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
163 for (; i < size - 6; i++) { /* XXX inefficient search */
164 const char* t = s + i;
Pablo Galindo261a4522021-03-28 23:48:05 +0100165 if (memcmp(t, "coding", 6) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 const char* begin = NULL;
167 t += 6;
168 if (t[0] != ':' && t[0] != '=')
169 continue;
170 do {
171 t++;
Pablo Galindo261a4522021-03-28 23:48:05 +0100172 } while (t[0] == ' ' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 begin = t;
175 while (Py_ISALNUM(t[0]) ||
176 t[0] == '-' || t[0] == '_' || t[0] == '.')
177 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700180 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200181 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700182 if (!r)
183 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700184 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100186 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700187 r = new_string(q, strlen(q), tok);
188 if (!r)
189 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700191 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200192 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 }
194 }
195 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700196 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000197}
198
199/* Check whether the line contains a coding spec. If it does,
200 invoke the set_readline function for the new encoding.
201 This function receives the tok_state and the new encoding.
202 Return 1 on success, 0 on failure. */
203
204static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000205check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000206 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000207{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700208 char *cs;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200209 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 /* It's a continuation line, so it can't be a coding spec. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100211 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000212 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200213 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100214 if (!get_coding_spec(line, &cs, size, tok)) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700215 return 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100216 }
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200217 if (!cs) {
218 Py_ssize_t i;
219 for (i = 0; i < size; i++) {
220 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
221 break;
222 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
223 /* Stop checking coding spec after a line containing
224 * anything except a comment. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100225 tok->decoding_state = STATE_NORMAL;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200226 break;
227 }
228 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700229 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200230 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100231 tok->decoding_state = STATE_NORMAL;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700232 if (tok->encoding == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100233 assert(tok->decoding_readline == NULL);
234 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
235 error_ret(tok);
236 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
237 PyMem_Free(cs);
238 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100240 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700241 } else { /* then, compare cs with BOM */
Pablo Galindo261a4522021-03-28 23:48:05 +0100242 if (strcmp(tok->encoding, cs) != 0) {
243 error_ret(tok);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700244 PyErr_Format(PyExc_SyntaxError,
245 "encoding problem: %s with BOM", cs);
Pablo Galindo261a4522021-03-28 23:48:05 +0100246 PyMem_Free(cs);
247 return 0;
248 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100249 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100251 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000252}
253
254/* See whether the file starts with a BOM. If it does,
255 invoke the set_readline function with the new encoding.
256 Return 1 on success, 0 on failure. */
257
258static int
259check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 void unget_char(int, struct tok_state *),
261 int set_readline(struct tok_state *, const char *),
262 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000263{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000264 int ch1, ch2, ch3;
265 ch1 = get_char(tok);
Pablo Galindo261a4522021-03-28 23:48:05 +0100266 tok->decoding_state = STATE_SEEK_CODING;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 if (ch1 == EOF) {
268 return 1;
269 } else if (ch1 == 0xEF) {
270 ch2 = get_char(tok);
271 if (ch2 != 0xBB) {
272 unget_char(ch2, tok);
273 unget_char(ch1, tok);
274 return 1;
275 }
276 ch3 = get_char(tok);
277 if (ch3 != 0xBF) {
278 unget_char(ch3, tok);
279 unget_char(ch2, tok);
280 unget_char(ch1, tok);
281 return 1;
282 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 /* Disable support for UTF-16 BOMs until a decision
285 is made whether this needs to be supported. */
286 } else if (ch1 == 0xFE) {
287 ch2 = get_char(tok);
288 if (ch2 != 0xFF) {
289 unget_char(ch2, tok);
290 unget_char(ch1, tok);
291 return 1;
292 }
293 if (!set_readline(tok, "utf-16-be"))
294 return 0;
295 tok->decoding_state = STATE_NORMAL;
296 } else if (ch1 == 0xFF) {
297 ch2 = get_char(tok);
298 if (ch2 != 0xFE) {
299 unget_char(ch2, tok);
300 unget_char(ch1, tok);
301 return 1;
302 }
303 if (!set_readline(tok, "utf-16-le"))
304 return 0;
305 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000306#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000307 } else {
308 unget_char(ch1, tok);
309 return 1;
310 }
311 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100312 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700313 tok->encoding = new_string("utf-8", 5, tok);
314 if (!tok->encoding)
315 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 /* No need to set_readline: input is already utf-8 */
317 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318}
319
Pablo Galindo261a4522021-03-28 23:48:05 +0100320static int
321tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100322 assert(tok->fp_interactive);
323
324 if (!line) {
325 return 0;
326 }
327
328 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
329 Py_ssize_t line_size = strlen(line);
330 char* new_str = tok->interactive_src_start;
331
332 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
333 if (!new_str) {
334 if (tok->interactive_src_start) {
335 PyMem_Free(tok->interactive_src_start);
336 }
337 tok->interactive_src_start = NULL;
338 tok->interactive_src_end = NULL;
339 tok->done = E_NOMEM;
340 return -1;
341 }
342 strcpy(new_str + current_size, line);
343
344 tok->interactive_src_start = new_str;
345 tok->interactive_src_end = new_str + current_size + line_size;
346 return 0;
347}
348
349
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000350/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000353 On entry, tok->decoding_buffer will be one of:
354 1) NULL: need to call tok->decoding_readline to get a new line
355 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 stored the result in tok->decoding_buffer
Pablo Galindo261a4522021-03-28 23:48:05 +0100357 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 (in the s buffer) to copy entire contents of the line read
359 by tok->decoding_readline. tok->decoding_buffer has the overflow.
Pablo Galindo261a4522021-03-28 23:48:05 +0100360 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000361 until the buffer ends with a '\n' (or until the end of the file is
Pablo Galindo261a4522021-03-28 23:48:05 +0100362 reached): see tok_nextc and its calls to tok_reserve_buf.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000364
Pablo Galindo261a4522021-03-28 23:48:05 +0100365static int
366tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000367{
Pablo Galindo261a4522021-03-28 23:48:05 +0100368 Py_ssize_t cur = tok->cur - tok->buf;
369 Py_ssize_t oldsize = tok->inp - tok->buf;
370 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
371 if (newsize > tok->end - tok->buf) {
372 char *newbuf = tok->buf;
373 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
374 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
375 if (newbuf == NULL) {
376 tok->done = E_NOMEM;
377 return 0;
378 }
379 tok->buf = newbuf;
380 tok->cur = tok->buf + cur;
381 tok->inp = tok->buf + oldsize;
382 tok->end = tok->buf + newsize;
383 tok->start = start < 0 ? NULL : tok->buf + start;
384 }
385 return 1;
386}
387
388static int
389tok_readline_recode(struct tok_state *tok) {
390 PyObject *line;
391 const char *buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 Py_ssize_t buflen;
Pablo Galindo261a4522021-03-28 23:48:05 +0100393 line = tok->decoding_buffer;
394 if (line == NULL) {
395 line = PyObject_CallNoArgs(tok->decoding_readline);
396 if (line == NULL) {
397 error_ret(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 goto error;
399 }
400 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100401 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000402 tok->decoding_buffer = NULL;
Pablo Galindo261a4522021-03-28 23:48:05 +0100403 }
404 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
405 if (buf == NULL) {
406 error_ret(tok);
407 goto error;
408 }
409 if (!tok_reserve_buf(tok, buflen + 1)) {
410 goto error;
411 }
412 memcpy(tok->inp, buf, buflen);
413 tok->inp += buflen;
414 *tok->inp = '\0';
415 if (tok->fp_interactive &&
416 tok_concatenate_interactive_new_line(tok, buf) == -1) {
417 goto error;
418 }
419 Py_DECREF(line);
420 return 1;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000421error:
Pablo Galindo261a4522021-03-28 23:48:05 +0100422 Py_XDECREF(line);
423 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424}
425
426/* Set the readline function for TOK to a StreamReader's
427 readline function. The StreamReader is named ENC.
428
429 This function is called from check_bom and check_coding_spec.
430
431 ENC is usually identical to the future value of tok->encoding,
432 except for the (currently unsupported) case of UTF-16.
433
434 Return 1 on success, 0 on failure. */
435
436static int
437fp_setreadl(struct tok_state *tok, const char* enc)
438{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700439 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200440 _Py_IDENTIFIER(open);
441 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000442 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200443 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
Victor Stinner22a351a2010-10-14 12:04:34 +0000445 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200446 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100447 * position of tok->fp. If tok->fp was opened in text mode on Windows,
448 * its file position counts CRLF as one char and can't be directly mapped
449 * to the file offset for fd. Instead we step back one byte and read to
450 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200451 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100452 if (pos == -1 ||
453 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000454 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700455 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000456 }
457
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700458 io = PyImport_ImportModuleNoBlock("io");
459 if (io == NULL)
460 return 0;
461
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200462 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000463 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700464 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700466 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000467
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200468 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700469 Py_DECREF(stream);
470 if (readline == NULL)
471 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300472 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700473
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100474 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100475 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700476 if (bufobj == NULL)
477 return 0;
478 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100479 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000480
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700481 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482}
483
484/* Fetch the next byte from TOK. */
485
486static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000487 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488}
489
490/* Unfetch the last byte back into TOK. */
491
492static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000493 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494}
495
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence (1..4) if yes, 0 if not.

   Besides the basic lead/continuation byte structure this rejects:
     - overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
     - UTF-16 surrogates (ED A0..BF),
     - code points above U+10FFFF (F4 90.. and lead bytes F5..FF).

   S is expected to be NUL-terminated.  Continuation bytes are inspected
   front to back, so a sequence cut short by the terminator is rejected
   without reading past it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    else if (*s < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        if (*s < 0xC2) {
            /* \x80-\xBF is a stray continuation byte,
               \xC0-\xC1 would be an overlong form of 0000-007F */
            return 0;
        }
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (*s == 0xE0 && s[1] < 0xA0) {
            /* overlong form of 0000-07FF */
            return 0;
        }
        if (*s == 0xED && s[1] >= 0xA0) {
            /* would decode to a surrogate D800-DFFF */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (*s == 0xF0 && s[1] < 0x90) {
            /* overlong form of 0000-FFFF */
            return 0;
        }
        if (*s == 0xF4 && s[1] >= 0x90) {
            /* beyond 10FFFF */
            return 0;
        }
        expected = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }

    /* s[1] is readable because s[0] is non-NUL; each further byte is read
       only after the previous one proved to be a continuation byte. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
523
Pablo Galindo261a4522021-03-28 23:48:05 +0100524static int
525ensure_utf8(char *line, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000527 int badchar = 0;
Pablo Galindo261a4522021-03-28 23:48:05 +0100528 unsigned char *c;
529 int length;
530 for (c = (unsigned char *)line; *c; c += length) {
531 if (!(length = valid_utf8(c))) {
532 badchar = *c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000533 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000534 }
535 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 if (badchar) {
537 /* Need to add 1 to the line number, since this line
Pablo Galindo261a4522021-03-28 23:48:05 +0100538 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200539 PyErr_Format(PyExc_SyntaxError,
Pablo Galindo261a4522021-03-28 23:48:05 +0100540 "Non-UTF-8 code starting with '\\x%.2x' "
541 "in file %U on line %i, "
542 "but no encoding declared; "
543 "see http://python.org/dev/peps/pep-0263/ for details",
544 badchar, tok->filename, tok->lineno + 1);
545 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100547 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548}
549
550/* Fetch a byte from TOK, using the string buffer. */
551
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000552static int
553buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000554 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555}
556
557/* Unfetch a byte from TOK, using the string buffer. */
558
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000559static void
560buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 tok->str--;
562 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563}
564
565/* Set the readline function for TOK to ENC. For the string-based
566 tokenizer, this means to just record the encoding. */
567
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000568static int
569buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000570 tok->enc = enc;
571 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572}
573
574/* Return a UTF-8 encoding Python string object from the
575 C byte string STR, which is encoded with ENC. */
576
577static PyObject *
578translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 PyObject *utf8;
580 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
581 if (buf == NULL)
582 return NULL;
583 utf8 = PyUnicode_AsUTF8String(buf);
584 Py_DECREF(buf);
585 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586}
587
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000588
589static char *
590translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200591 int skip_next_lf = 0;
592 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000593 char *buf, *current;
594 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100595 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000596 if (buf == NULL) {
597 tok->done = E_NOMEM;
598 return NULL;
599 }
600 for (current = buf; *s; s++, current++) {
601 c = *s;
602 if (skip_next_lf) {
603 skip_next_lf = 0;
604 if (c == '\n') {
605 c = *++s;
606 if (!c)
607 break;
608 }
609 }
610 if (c == '\r') {
611 skip_next_lf = 1;
612 c = '\n';
613 }
614 *current = c;
615 }
616 /* If this is exec input, add a newline to the end of the string if
617 there isn't one already. */
618 if (exec_input && c != '\n') {
619 *current = '\n';
620 current++;
621 }
622 *current = '\0';
623 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000624 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100626 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000627 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100628 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000629 }
630 buf = result;
631 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000633}
634
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635/* Decode a byte string STR for use as the buffer of TOK.
636 Look for encoding declarations inside STR, and record them
637 inside TOK. */
638
Andy Lester384f3c52020-02-27 20:44:52 -0600639static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000640decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600643 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 const char *s;
645 const char *newl[2] = {NULL, NULL};
646 int lineno = 0;
647 tok->input = str = translate_newlines(input, single, tok);
648 if (str == NULL)
649 return NULL;
650 tok->enc = NULL;
651 tok->str = str;
652 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
653 return error_ret(tok);
654 str = tok->str; /* string after BOM if any */
655 assert(str);
656 if (tok->enc != NULL) {
657 utf8 = translate_into_utf8(str, tok->enc);
658 if (utf8 == NULL)
659 return error_ret(tok);
660 str = PyBytes_AsString(utf8);
661 }
662 for (s = str;; s++) {
663 if (*s == '\0') break;
664 else if (*s == '\n') {
665 assert(lineno < 2);
666 newl[lineno] = s;
667 lineno++;
668 if (lineno == 2) break;
669 }
670 }
671 tok->enc = NULL;
672 /* need to check line 1 and 2 separately since check_coding_spec
673 assumes a single line as input */
674 if (newl[0]) {
Pablo Galindo261a4522021-03-28 23:48:05 +0100675 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
676 return NULL;
677 }
678 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
680 tok, buf_setreadl))
Pablo Galindo261a4522021-03-28 23:48:05 +0100681 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 }
683 }
684 if (tok->enc != NULL) {
685 assert(utf8 == NULL);
686 utf8 = translate_into_utf8(str, tok->enc);
687 if (utf8 == NULL)
688 return error_ret(tok);
689 str = PyBytes_AS_STRING(utf8);
690 }
691 assert(tok->decoding_buffer == NULL);
692 tok->decoding_buffer = utf8; /* CAUTION */
693 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000694}
695
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000696/* Set up tokenizer for string */
697
698struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000699PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000700{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000701 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600702 char *decoded;
703
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 if (tok == NULL)
705 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600706 decoded = decode_str(str, exec_input, tok);
707 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 PyTokenizer_Free(tok);
709 return NULL;
710 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000711
Andy Lester384f3c52020-02-27 20:44:52 -0600712 tok->buf = tok->cur = tok->inp = decoded;
713 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715}
716
Pablo Galindo261a4522021-03-28 23:48:05 +0100717/* Set up tokenizer for UTF-8 string */
718
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000719struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000720PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000721{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600723 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 if (tok == NULL)
725 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600726 tok->input = translated = translate_newlines(str, exec_input, tok);
727 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 PyTokenizer_Free(tok);
729 return NULL;
730 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100731 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600733 tok->str = translated;
Pablo Galindo261a4522021-03-28 23:48:05 +0100734 tok->encoding = new_string("utf-8", 5, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 if (!tok->encoding) {
736 PyTokenizer_Free(tok);
737 return NULL;
738 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000739
Andy Lester384f3c52020-02-27 20:44:52 -0600740 tok->buf = tok->cur = tok->inp = translated;
741 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000743}
744
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000745/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746
747struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300748PyTokenizer_FromFile(FILE *fp, const char* enc,
749 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 struct tok_state *tok = tok_new();
752 if (tok == NULL)
753 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100754 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 PyTokenizer_Free(tok);
756 return NULL;
757 }
758 tok->cur = tok->inp = tok->buf;
759 tok->end = tok->buf + BUFSIZ;
760 tok->fp = fp;
761 tok->prompt = ps1;
762 tok->nextprompt = ps2;
763 if (enc != NULL) {
764 /* Must copy encoding declaration since it
765 gets copied into the parse tree. */
Pablo Galindo261a4522021-03-28 23:48:05 +0100766 tok->encoding = new_string(enc, strlen(enc), tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 if (!tok->encoding) {
768 PyTokenizer_Free(tok);
769 return NULL;
770 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 tok->decoding_state = STATE_NORMAL;
772 }
773 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774}
775
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776/* Free a tok_state structure */
777
778void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000779PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780{
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100781 if (tok->encoding != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100782 PyMem_Free(tok->encoding);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100783 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 Py_XDECREF(tok->decoding_readline);
785 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200786 Py_XDECREF(tok->filename);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100787 if (tok->fp != NULL && tok->buf != NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100788 PyMem_Free(tok->buf);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100789 }
790 if (tok->input) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100791 PyMem_Free(tok->input);
Pablo Galindocd8dcbc2021-03-14 04:38:40 +0100792 }
793 if (tok->interactive_src_start != NULL) {
794 PyMem_Free(tok->interactive_src_start);
795 }
Victor Stinner00d7abd2020-12-01 09:56:42 +0100796 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000797}
798
Pablo Galindo261a4522021-03-28 23:48:05 +0100799static int
800tok_readline_raw(struct tok_state *tok)
801{
802 do {
803 if (!tok_reserve_buf(tok, BUFSIZ)) {
804 return 0;
805 }
806 char *line = Py_UniversalNewlineFgets(tok->inp,
807 (int)(tok->end - tok->inp),
808 tok->fp, NULL);
809 if (line == NULL) {
810 return 1;
811 }
812 if (tok->fp_interactive &&
813 tok_concatenate_interactive_new_line(tok, line) == -1) {
814 return 0;
815 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100816 if (*tok->inp == '\0') {
817 return 0;
818 }
Pablo Galindo261a4522021-03-28 23:48:05 +0100819 tok->inp = strchr(tok->inp, '\0');
820 } while (tok->inp[-1] != '\n');
821 return 1;
822}
823
824static int
825tok_underflow_string(struct tok_state *tok) {
826 char *end = strchr(tok->inp, '\n');
827 if (end != NULL) {
828 end++;
829 }
830 else {
831 end = strchr(tok->inp, '\0');
832 if (end == tok->inp) {
833 tok->done = E_EOF;
834 return 0;
835 }
836 }
837 if (tok->start == NULL) {
838 tok->buf = tok->cur;
839 }
840 tok->line_start = tok->cur;
841 tok->lineno++;
842 tok->inp = end;
843 return 1;
844}
845
846static int
847tok_underflow_interactive(struct tok_state *tok) {
848 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
849 if (newtok != NULL) {
850 char *translated = translate_newlines(newtok, 0, tok);
851 PyMem_Free(newtok);
852 if (translated == NULL) {
853 return 0;
854 }
855 newtok = translated;
856 }
857 if (tok->encoding && newtok && *newtok) {
858 /* Recode to UTF-8 */
859 Py_ssize_t buflen;
860 const char* buf;
861 PyObject *u = translate_into_utf8(newtok, tok->encoding);
862 PyMem_Free(newtok);
863 if (u == NULL) {
864 tok->done = E_DECODE;
865 return 0;
866 }
867 buflen = PyBytes_GET_SIZE(u);
868 buf = PyBytes_AS_STRING(u);
869 newtok = PyMem_Malloc(buflen+1);
870 if (newtok == NULL) {
871 Py_DECREF(u);
872 tok->done = E_NOMEM;
873 return 0;
874 }
875 strcpy(newtok, buf);
876 Py_DECREF(u);
877 }
878 if (tok->fp_interactive &&
879 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
880 PyMem_Free(newtok);
881 return 0;
882 }
883 if (tok->nextprompt != NULL) {
884 tok->prompt = tok->nextprompt;
885 }
886 if (newtok == NULL) {
887 tok->done = E_INTR;
888 }
889 else if (*newtok == '\0') {
890 PyMem_Free(newtok);
891 tok->done = E_EOF;
892 }
893 else if (tok->start != NULL) {
894 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
895 size_t size = strlen(newtok);
896 tok->lineno++;
897 if (!tok_reserve_buf(tok, size + 1)) {
898 PyMem_Free(tok->buf);
899 tok->buf = NULL;
900 PyMem_Free(newtok);
901 return 0;
902 }
903 memcpy(tok->cur, newtok, size + 1);
904 PyMem_Free(newtok);
905 tok->inp += size;
906 tok->multi_line_start = tok->buf + cur_multi_line_start;
907 }
908 else {
909 tok->lineno++;
910 PyMem_Free(tok->buf);
911 tok->buf = newtok;
912 tok->cur = tok->buf;
913 tok->line_start = tok->buf;
914 tok->inp = strchr(tok->buf, '\0');
915 tok->end = tok->inp + 1;
916 }
917 if (tok->done != E_OK) {
918 if (tok->prompt != NULL) {
919 PySys_WriteStderr("\n");
920 }
921 return 0;
922 }
923 return 1;
924}
925
926static int
927tok_underflow_file(struct tok_state *tok) {
928 if (tok->start == NULL) {
929 tok->cur = tok->inp = tok->buf;
930 }
931 if (tok->decoding_state == STATE_INIT) {
932 /* We have not yet determined the encoding.
933 If an encoding is found, use the file-pointer
934 reader functions from now on. */
935 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
936 error_ret(tok);
937 return 0;
938 }
939 assert(tok->decoding_state != STATE_INIT);
940 }
941 /* Read until '\n' or EOF */
942 if (tok->decoding_readline != NULL) {
943 /* We already have a codec associated with this input. */
944 if (!tok_readline_recode(tok)) {
945 return 0;
946 }
947 }
948 else {
949 /* We want a 'raw' read. */
950 if (!tok_readline_raw(tok)) {
951 return 0;
952 }
953 }
954 if (tok->inp == tok->cur) {
955 tok->done = E_EOF;
956 return 0;
957 }
958 if (tok->inp[-1] != '\n') {
959 /* Last line does not end in \n, fake one */
960 *tok->inp++ = '\n';
961 *tok->inp = '\0';
962 }
963
964 tok->lineno++;
965 if (tok->decoding_state != STATE_NORMAL) {
966 if (tok->lineno > 2) {
967 tok->decoding_state = STATE_NORMAL;
968 }
Pablo Galindo92a02c12021-03-30 00:24:49 +0100969 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Pablo Galindo261a4522021-03-28 23:48:05 +0100970 tok, fp_setreadl))
971 {
972 return 0;
973 }
974 }
975 /* The default encoding is UTF-8, so make sure we don't have any
976 non-UTF-8 sequences in it. */
977 if (!tok->encoding
978 && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
979 if (!ensure_utf8(tok->cur, tok)) {
980 error_ret(tok);
981 return 0;
982 }
983 }
984 assert(tok->done == E_OK);
985 return tok->done == E_OK;
986}
987
/* Write the `size` bytes at `s` to `f` as a double-quoted, escaped
   string (debugging aid).

   A NULL `s` is written as the bare word NULL.  Newline, carriage
   return, tab, form feed, both quote characters and the backslash get
   a C-style backslash escape; other printable ASCII (0x20..0x7e) is
   copied through; everything else, including DEL (0x7f), is written
   as \xNN.  Escaping the backslash keeps the output unambiguous: a
   real newline and the two characters '\' 'n' no longer print alike. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = (unsigned char)*s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, not printable. */
                if (0x20 <= c && c < 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
1014
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001015/* Get next char, updating state; error code goes into tok->done */
1016
1017static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001018tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001019{
Pablo Galindo261a4522021-03-28 23:48:05 +01001020 int rc;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 for (;;) {
1022 if (tok->cur != tok->inp) {
1023 return Py_CHARMASK(*tok->cur++); /* Fast path */
1024 }
1025 if (tok->done != E_OK)
1026 return EOF;
1027 if (tok->fp == NULL) {
Pablo Galindo261a4522021-03-28 23:48:05 +01001028 rc = tok_underflow_string(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001030 else if (tok->prompt != NULL) {
1031 rc = tok_underflow_interactive(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 }
1033 else {
Pablo Galindo261a4522021-03-28 23:48:05 +01001034 rc = tok_underflow_file(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001035 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001036 if (Py_DebugFlag) {
1037 printf("line[%d] = ", tok->lineno);
1038 print_escape(stdout, tok->cur, tok->inp - tok->cur);
1039 printf(" tok->done = %d\n", tok->done);
1040 }
1041 if (!rc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 tok->cur = tok->inp;
1043 return EOF;
1044 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001045 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001047 Py_UNREACHABLE();
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048}
1049
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001050/* Back-up one character */
1051
1052static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001053tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001054{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001056 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001057 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001058 }
Pablo Galindo261a4522021-03-28 23:48:05 +01001059 if ((int)(unsigned char)*tok->cur != c) {
1060 Py_FatalError("tok_backup: wrong character");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001061 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001063}
1064
1065
Guido van Rossum926f13a1998-04-09 21:38:06 +00001066static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001067syntaxerror(struct tok_state *tok, const char *format, ...)
1068{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001069 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001070 va_list vargs;
1071#ifdef HAVE_STDARG_PROTOTYPES
1072 va_start(vargs, format);
1073#else
1074 va_start(vargs);
1075#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001076 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001077 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001078 if (!errmsg) {
1079 goto error;
1080 }
1081
1082 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1083 "replace");
1084 if (!errtext) {
1085 goto error;
1086 }
1087 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1088 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1089 if (line_len != tok->cur - tok->line_start) {
1090 Py_DECREF(errtext);
1091 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1092 "replace");
1093 }
1094 if (!errtext) {
1095 goto error;
1096 }
1097
1098 args = Py_BuildValue("(O(OiiN))", errmsg,
1099 tok->filename, tok->lineno, offset, errtext);
1100 if (args) {
1101 PyErr_SetObject(PyExc_SyntaxError, args);
1102 Py_DECREF(args);
1103 }
1104
1105error:
1106 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001107 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001108 return ERRORTOKEN;
1109}
1110
1111static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001112indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001113{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001114 tok->done = E_TABSPACE;
1115 tok->cur = tok->inp;
1116 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001117}
1118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119/* Verify that the identifier follows PEP 3131.
1120 All identifier strings are guaranteed to be "ready" unicode objects.
1121 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001122static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001123verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001124{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001126 if (tok->decoding_erred)
1127 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001129 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001131 tok->done = E_DECODE;
1132 }
1133 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001134 tok->done = E_ERROR;
1135 }
1136 return 0;
1137 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001138 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1139 if (invalid < 0) {
1140 Py_DECREF(s);
1141 tok->done = E_ERROR;
1142 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001143 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001144 assert(PyUnicode_GET_LENGTH(s) > 0);
1145 if (invalid < PyUnicode_GET_LENGTH(s)) {
1146 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1147 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1148 /* Determine the offset in UTF-8 encoded input */
1149 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1150 if (s != NULL) {
1151 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1152 }
1153 if (s == NULL) {
1154 tok->done = E_ERROR;
1155 return 0;
1156 }
1157 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1158 }
1159 Py_DECREF(s);
1160 // PyUnicode_FromFormatV() does not support %X
1161 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001162 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001163 if (Py_UNICODE_ISPRINTABLE(ch)) {
1164 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1165 }
1166 else {
1167 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1168 }
1169 return 0;
1170 }
1171 Py_DECREF(s);
1172 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001173}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001174
/* Consume the rest of a decimal digit run, allowing single
   underscores between digits.

   Returns the first character that is neither a digit nor such an
   underscore (it has been read, not backed up).  If an underscore is
   not followed by a digit, raises a SyntaxError through syntaxerror()
   and returns 0. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    for (;;) {
        /* Swallow one run of digits. */
        c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1196
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001197/* Get next token, after space stripping etc. */
1198
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001199static int
Andy Lester384f3c52020-02-27 20:44:52 -06001200tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001201{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001202 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001204
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001206 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 tok->start = NULL;
1208 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001209
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210 /* Get indentation level */
1211 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001212 int col = 0;
1213 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 tok->atbol = 0;
1215 for (;;) {
1216 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001217 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001218 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001219 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001221 col = (col / tok->tabsize + 1) * tok->tabsize;
1222 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 }
Brett Cannona721aba2016-09-09 14:57:09 -07001224 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001226 }
1227 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001229 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 }
1231 tok_backup(tok, c);
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001232 if (c == '#' || c == '\n' || c == '\\') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 /* Lines with only whitespace and/or comments
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001234 and/or a line continuation character
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001235 shouldn't affect the indentation and are
1236 not passed to the parser as NEWLINE tokens,
1237 except *totally* empty lines in interactive
1238 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001239 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001241 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001242 else if (tok->prompt != NULL && tok->lineno == 1) {
1243 /* In interactive mode, if the first line contains
1244 only spaces and/or a comment, let it through. */
1245 blankline = 0;
1246 col = altcol = 0;
1247 }
Brett Cannona721aba2016-09-09 14:57:09 -07001248 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001250 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 /* We can't jump back right here since we still
1252 may need to skip to the end of a comment */
1253 }
1254 if (!blankline && tok->level == 0) {
1255 if (col == tok->indstack[tok->indent]) {
1256 /* No change */
1257 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001258 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 }
1260 }
1261 else if (col > tok->indstack[tok->indent]) {
1262 /* Indent -- always one */
1263 if (tok->indent+1 >= MAXINDENT) {
1264 tok->done = E_TOODEEP;
1265 tok->cur = tok->inp;
1266 return ERRORTOKEN;
1267 }
1268 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001269 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 }
1271 tok->pendin++;
1272 tok->indstack[++tok->indent] = col;
1273 tok->altindstack[tok->indent] = altcol;
1274 }
1275 else /* col < tok->indstack[tok->indent] */ {
1276 /* Dedent -- any number, must be consistent */
1277 while (tok->indent > 0 &&
1278 col < tok->indstack[tok->indent]) {
1279 tok->pendin--;
1280 tok->indent--;
1281 }
1282 if (col != tok->indstack[tok->indent]) {
1283 tok->done = E_DEDENT;
1284 tok->cur = tok->inp;
1285 return ERRORTOKEN;
1286 }
1287 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001288 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 }
1290 }
1291 }
1292 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001293
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001295
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 /* Return pending indents/dedents */
1297 if (tok->pendin != 0) {
1298 if (tok->pendin < 0) {
1299 tok->pendin++;
1300 return DEDENT;
1301 }
1302 else {
1303 tok->pendin--;
1304 return INDENT;
1305 }
1306 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001307
Guido van Rossum495da292019-03-07 12:38:08 -08001308 /* Peek ahead at the next character */
1309 c = tok_nextc(tok);
1310 tok_backup(tok, c);
1311 /* Check if we are closing an async function */
1312 if (tok->async_def
1313 && !blankline
1314 /* Due to some implementation artifacts of type comments,
1315 * a TYPE_COMMENT at the start of a function won't set an
1316 * indentation level and it will produce a NEWLINE after it.
1317 * To avoid spuriously ending an async function due to this,
1318 * wait until we have some non-newline char in front of us. */
1319 && c != '\n'
1320 && tok->level == 0
1321 /* There was a NEWLINE after ASYNC DEF,
1322 so we're past the signature. */
1323 && tok->async_def_nl
1324 /* Current indentation level is less than where
1325 the async function was defined */
1326 && tok->async_def_indent >= tok->indent)
1327 {
1328 tok->async_def = 0;
1329 tok->async_def_indent = 0;
1330 tok->async_def_nl = 0;
1331 }
1332
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 tok->start = NULL;
1335 /* Skip spaces */
1336 do {
1337 c = tok_nextc(tok);
1338 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001339
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 /* Set start of current token */
1341 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001342
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001343 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001344 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001345 const char *prefix, *p, *type_start;
1346
Brett Cannona721aba2016-09-09 14:57:09 -07001347 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001349 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001350
1351 if (tok->type_comments) {
1352 p = tok->start;
1353 prefix = type_comment_prefix;
1354 while (*prefix && p < tok->cur) {
1355 if (*prefix == ' ') {
1356 while (*p == ' ' || *p == '\t') {
1357 p++;
1358 }
1359 } else if (*prefix == *p) {
1360 p++;
1361 } else {
1362 break;
1363 }
1364
1365 prefix++;
1366 }
1367
1368 /* This is a type comment if we matched all of type_comment_prefix. */
1369 if (!*prefix) {
1370 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001371 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001372 tok_backup(tok, c); /* don't eat the newline or EOF */
1373
1374 type_start = p;
1375
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001376 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001377 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001378 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001379 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001380 && !(tok->cur > ignore_end
1381 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001382
1383 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001384 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001385 *p_end = tok->cur;
1386
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001387 /* If this type ignore is the only thing on the line, consume the newline also. */
1388 if (blankline) {
1389 tok_nextc(tok);
1390 tok->atbol = 1;
1391 }
1392 return TYPE_IGNORE;
1393 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001394 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001395 *p_end = tok->cur;
1396 return TYPE_COMMENT;
1397 }
1398 }
1399 }
Brett Cannona721aba2016-09-09 14:57:09 -07001400 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 /* Check for EOF and errors now */
1403 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001404 if (tok->level) {
1405 return ERRORTOKEN;
1406 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1408 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001409
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 /* Identifier (most frequent token!) */
1411 nonascii = 0;
1412 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001413 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001414 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001415 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001416 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001417 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001418 /* Since this is a backwards compatibility support literal we don't
1419 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001420 else if (!(saw_b || saw_u || saw_r || saw_f)
1421 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001422 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001423 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001424 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001425 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001426 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001427 }
1428 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001429 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001430 }
1431 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001432 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001433 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001434 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001435 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001436 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001437 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 }
1439 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001440 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001442 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 c = tok_nextc(tok);
1444 }
1445 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001446 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001448 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001449
1450 *p_start = tok->start;
1451 *p_end = tok->cur;
1452
Guido van Rossum495da292019-03-07 12:38:08 -08001453 /* async/await parsing block. */
1454 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1455 /* May be an 'async' or 'await' token. For Python 3.7 or
1456 later we recognize them unconditionally. For Python
1457 3.5 or 3.6 we recognize 'async' in front of 'def', and
1458 either one inside of 'async def'. (Technically we
1459 shouldn't recognize these at all for 3.4 or earlier,
1460 but there's no *valid* Python 3.4 code that would be
1461 rejected, and async functions will be rejected in a
1462 later phase.) */
1463 if (!tok->async_hacks || tok->async_def) {
1464 /* Always recognize the keywords. */
1465 if (memcmp(tok->start, "async", 5) == 0) {
1466 return ASYNC;
1467 }
1468 if (memcmp(tok->start, "await", 5) == 0) {
1469 return AWAIT;
1470 }
1471 }
1472 else if (memcmp(tok->start, "async", 5) == 0) {
1473 /* The current token is 'async'.
1474 Look ahead one token to see if that is 'def'. */
1475
1476 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001477 const char *ahead_tok_start = NULL;
1478 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001479 int ahead_tok_kind;
1480
1481 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1482 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1483 &ahead_tok_end);
1484
1485 if (ahead_tok_kind == NAME
1486 && ahead_tok.cur - ahead_tok.start == 3
1487 && memcmp(ahead_tok.start, "def", 3) == 0)
1488 {
1489 /* The next token is going to be 'def', so instead of
1490 returning a plain NAME token, return ASYNC. */
1491 tok->async_def_indent = tok->indent;
1492 tok->async_def = 1;
1493 return ASYNC;
1494 }
1495 }
1496 }
1497
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001498 return NAME;
1499 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001500
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 /* Newline */
1502 if (c == '\n') {
1503 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001504 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001505 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001506 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 *p_start = tok->start;
1508 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1509 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001510 if (tok->async_def) {
1511 /* We're somewhere inside an 'async def' function, and
1512 we've encountered a NEWLINE after its signature. */
1513 tok->async_def_nl = 1;
1514 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 return NEWLINE;
1516 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001517
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001518 /* Period or number starting with period? */
1519 if (c == '.') {
1520 c = tok_nextc(tok);
1521 if (isdigit(c)) {
1522 goto fraction;
1523 } else if (c == '.') {
1524 c = tok_nextc(tok);
1525 if (c == '.') {
1526 *p_start = tok->start;
1527 *p_end = tok->cur;
1528 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001529 }
1530 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 tok_backup(tok, c);
1532 }
1533 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001534 }
1535 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001536 tok_backup(tok, c);
1537 }
1538 *p_start = tok->start;
1539 *p_end = tok->cur;
1540 return DOT;
1541 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001542
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 /* Number */
1544 if (isdigit(c)) {
1545 if (c == '0') {
1546 /* Hex, octal or binary -- maybe. */
1547 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001548 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 /* Hex */
1550 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001552 if (c == '_') {
1553 c = tok_nextc(tok);
1554 }
1555 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001556 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001557 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001558 }
1559 do {
1560 c = tok_nextc(tok);
1561 } while (isxdigit(c));
1562 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 }
1564 else if (c == 'o' || c == 'O') {
1565 /* Octal */
1566 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001568 if (c == '_') {
1569 c = tok_nextc(tok);
1570 }
1571 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001572 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001573 if (isdigit(c)) {
1574 return syntaxerror(tok,
1575 "invalid digit '%c' in octal literal", c);
1576 }
1577 else {
1578 return syntaxerror(tok, "invalid octal literal");
1579 }
Brett Cannona721aba2016-09-09 14:57:09 -07001580 }
1581 do {
1582 c = tok_nextc(tok);
1583 } while ('0' <= c && c < '8');
1584 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001585 if (isdigit(c)) {
1586 return syntaxerror(tok,
1587 "invalid digit '%c' in octal literal", c);
1588 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 }
1590 else if (c == 'b' || c == 'B') {
1591 /* Binary */
1592 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001594 if (c == '_') {
1595 c = tok_nextc(tok);
1596 }
1597 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001598 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001599 if (isdigit(c)) {
1600 return syntaxerror(tok,
1601 "invalid digit '%c' in binary literal", c);
1602 }
1603 else {
1604 return syntaxerror(tok, "invalid binary literal");
1605 }
Brett Cannona721aba2016-09-09 14:57:09 -07001606 }
1607 do {
1608 c = tok_nextc(tok);
1609 } while (c == '0' || c == '1');
1610 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001611 if (isdigit(c)) {
1612 return syntaxerror(tok,
1613 "invalid digit '%c' in binary literal", c);
1614 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001615 }
1616 else {
1617 int nonzero = 0;
1618 /* maybe old-style octal; c is first char of it */
1619 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001620 while (1) {
1621 if (c == '_') {
1622 c = tok_nextc(tok);
1623 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001624 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001625 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001626 }
1627 }
1628 if (c != '0') {
1629 break;
1630 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 c = tok_nextc(tok);
1632 }
Brett Cannona721aba2016-09-09 14:57:09 -07001633 if (isdigit(c)) {
1634 nonzero = 1;
1635 c = tok_decimal_tail(tok);
1636 if (c == 0) {
1637 return ERRORTOKEN;
1638 }
1639 }
1640 if (c == '.') {
1641 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001643 }
1644 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001646 }
1647 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001649 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001651 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001652 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001653 return syntaxerror(tok,
1654 "leading zeros in decimal integer "
1655 "literals are not permitted; "
1656 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001657 }
1658 }
1659 }
1660 else {
1661 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001662 c = tok_decimal_tail(tok);
1663 if (c == 0) {
1664 return ERRORTOKEN;
1665 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001666 {
1667 /* Accept floating point numbers. */
1668 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001669 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 fraction:
1671 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001672 if (isdigit(c)) {
1673 c = tok_decimal_tail(tok);
1674 if (c == 0) {
1675 return ERRORTOKEN;
1676 }
1677 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 }
1679 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001680 int e;
1681 exponent:
1682 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 /* Exponent part */
1684 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001685 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001687 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001688 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001689 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001690 }
1691 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001693 tok_backup(tok, e);
1694 *p_start = tok->start;
1695 *p_end = tok->cur;
1696 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 }
Brett Cannona721aba2016-09-09 14:57:09 -07001698 c = tok_decimal_tail(tok);
1699 if (c == 0) {
1700 return ERRORTOKEN;
1701 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 }
Brett Cannona721aba2016-09-09 14:57:09 -07001703 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 /* Imaginary part */
1705 imaginary:
1706 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001707 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 }
1709 }
1710 tok_backup(tok, c);
1711 *p_start = tok->start;
1712 *p_end = tok->cur;
1713 return NUMBER;
1714 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001715
1716 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 /* String */
1718 if (c == '\'' || c == '"') {
1719 int quote = c;
1720 int quote_size = 1; /* 1 or 3 */
1721 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001722
Anthony Sottile995d9b92019-01-12 20:05:13 -08001723 /* Nodes of type STRING, especially multi line strings
1724 must be handled differently in order to get both
1725 the starting line number and the column offset right.
1726 (cf. issue 16806) */
1727 tok->first_lineno = tok->lineno;
1728 tok->multi_line_start = tok->line_start;
1729
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 /* Find the quote size and start of string */
1731 c = tok_nextc(tok);
1732 if (c == quote) {
1733 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001734 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001736 }
1737 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001739 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 }
Brett Cannona721aba2016-09-09 14:57:09 -07001741 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001743 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001744
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 /* Get rest of string */
1746 while (end_quote_size != quote_size) {
1747 c = tok_nextc(tok);
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001748 if (c == EOF || (quote_size == 1 && c == '\n')) {
1749 // shift the tok_state's location into
1750 // the start of string, and report the error
1751 // from the initial quote character
1752 tok->cur = (char *)tok->start;
1753 tok->cur++;
1754 tok->line_start = tok->multi_line_start;
1755 int start = tok->lineno;
1756 tok->lineno = tok->first_lineno;
1757
Brett Cannona721aba2016-09-09 14:57:09 -07001758 if (quote_size == 3) {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001759 return syntaxerror(tok,
1760 "unterminated triple-quoted string literal"
1761 " (detected at line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001762 }
1763 else {
Batuhan Taskayaa698d522021-01-21 00:38:47 +03001764 return syntaxerror(tok,
1765 "unterminated string literal (detected at"
1766 " line %d)", start);
Brett Cannona721aba2016-09-09 14:57:09 -07001767 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 }
Brett Cannona721aba2016-09-09 14:57:09 -07001769 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001770 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001771 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 else {
1773 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001774 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001775 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001776 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 }
1778 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001779
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001780 *p_start = tok->start;
1781 *p_end = tok->cur;
1782 return STRING;
1783 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001784
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001785 /* Line continuation */
1786 if (c == '\\') {
1787 c = tok_nextc(tok);
1788 if (c != '\n') {
1789 tok->done = E_LINECONT;
1790 tok->cur = tok->inp;
1791 return ERRORTOKEN;
1792 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001793 c = tok_nextc(tok);
1794 if (c == EOF) {
1795 tok->done = E_EOF;
1796 tok->cur = tok->inp;
1797 return ERRORTOKEN;
1798 } else {
1799 tok_backup(tok, c);
1800 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001801 tok->cont_line = 1;
1802 goto again; /* Read next line */
1803 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001804
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001805 /* Check for two-character token */
1806 {
1807 int c2 = tok_nextc(tok);
1808 int token = PyToken_TwoChars(c, c2);
1809 if (token != OP) {
1810 int c3 = tok_nextc(tok);
1811 int token3 = PyToken_ThreeChars(c, c2, c3);
1812 if (token3 != OP) {
1813 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001814 }
1815 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001816 tok_backup(tok, c3);
1817 }
1818 *p_start = tok->start;
1819 *p_end = tok->cur;
1820 return token;
1821 }
1822 tok_backup(tok, c2);
1823 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001824
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001825 /* Keep track of parentheses nesting level */
1826 switch (c) {
1827 case '(':
1828 case '[':
1829 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001830 if (tok->level >= MAXLEVEL) {
1831 return syntaxerror(tok, "too many nested parentheses");
1832 }
1833 tok->parenstack[tok->level] = c;
1834 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindoae7d3cd92021-01-20 12:53:52 +00001835 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001836 tok->level++;
1837 break;
1838 case ')':
1839 case ']':
1840 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001841 if (!tok->level) {
1842 return syntaxerror(tok, "unmatched '%c'", c);
1843 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001844 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001845 int opening = tok->parenstack[tok->level];
1846 if (!((opening == '(' && c == ')') ||
1847 (opening == '[' && c == ']') ||
1848 (opening == '{' && c == '}')))
1849 {
1850 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1851 return syntaxerror(tok,
1852 "closing parenthesis '%c' does not match "
1853 "opening parenthesis '%c' on line %d",
1854 c, opening, tok->parenlinenostack[tok->level]);
1855 }
1856 else {
1857 return syntaxerror(tok,
1858 "closing parenthesis '%c' does not match "
1859 "opening parenthesis '%c'",
1860 c, opening);
1861 }
1862 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001863 break;
1864 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001865
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001866 /* Punctuation character */
1867 *p_start = tok->start;
1868 *p_end = tok->cur;
1869 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001870}
1871
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001872int
Andy Lester384f3c52020-02-27 20:44:52 -06001873PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001874{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001875 int result = tok_get(tok, p_start, p_end);
1876 if (tok->decoding_erred) {
1877 result = ERRORTOKEN;
1878 tok->done = E_DECODE;
1879 }
1880 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001881}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001882
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001883/* Get the encoding of a Python file. Check for the coding cookie and check if
1884 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001885
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001886 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1887 encoding in the first or second line of the file (in which case the encoding
1888 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001889
Victor Stinner00d7abd2020-12-01 09:56:42 +01001890 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001891 by the caller. */
1892
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001893char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001894PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001895{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001896 struct tok_state *tok;
1897 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001898 const char *p_start = NULL;
1899 const char *p_end = NULL;
1900 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001901
Victor Stinnerdaf45552013-08-28 00:53:59 +02001902 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001903 if (fd < 0) {
1904 return NULL;
1905 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001906
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001907 fp = fdopen(fd, "r");
1908 if (fp == NULL) {
1909 return NULL;
1910 }
1911 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1912 if (tok == NULL) {
1913 fclose(fp);
1914 return NULL;
1915 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001916 if (filename != NULL) {
1917 Py_INCREF(filename);
1918 tok->filename = filename;
1919 }
1920 else {
1921 tok->filename = PyUnicode_FromString("<string>");
1922 if (tok->filename == NULL) {
1923 fclose(fp);
1924 PyTokenizer_Free(tok);
1925 return encoding;
1926 }
1927 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001928 while (tok->lineno < 2 && tok->done == E_OK) {
1929 PyTokenizer_Get(tok, &p_start, &p_end);
1930 }
1931 fclose(fp);
1932 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01001933 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Pablo Galindo261a4522021-03-28 23:48:05 +01001934 if (encoding) {
Hansraj Das69f37bc2019-08-15 21:49:07 +05301935 strcpy(encoding, tok->encoding);
Pablo Galindo261a4522021-03-28 23:48:05 +01001936 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001937 }
1938 PyTokenizer_Free(tok);
1939 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001940}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001941
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001942char *
1943PyTokenizer_FindEncoding(int fd)
1944{
1945 return PyTokenizer_FindEncodingFilename(fd, NULL);
1946}
1947
Guido van Rossum408027e1996-12-30 16:17:54 +00001948#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001949
1950void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001951tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001952{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001953 printf("%s", _PyParser_TokenNames[type]);
1954 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1955 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001956}
1957
1958#endif