blob: f9c8bf652cdfb3b27fd95d468b8398eac6e06c5a [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* Non-zero if C is an ASCII letter, an underscore, or any value >= 128.
   The argument is fully parenthesized so that expression arguments (for
   example a conditional expression) are classified correctly.  C is
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000027
/* Non-zero if C is an ASCII letter, an ASCII digit, an underscore, or any
   value >= 128.  The argument is fully parenthesized so that expression
   arguments are classified correctly.  C is evaluated several times, so
   it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035
Guido van Rossum4fe87291992-02-26 15:24:44 +000036/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000037#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038
Guido van Rossum3f5da241990-12-20 15:06:42 +000039/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000040static struct tok_state *tok_new(void);
41static int tok_nextc(struct tok_state *tok);
42static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000043
Brett Cannond5ec98c2007-10-20 02:54:14 +000044
Guido van Rossumdcfcd142019-01-31 03:40:27 -080045/* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
47static const char* type_comment_prefix = "# type: ";
48
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000049/* Create and initialize a new tok_state structure */
50
51static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000052tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053{
Victor Stinner00d7abd2020-12-01 09:56:42 +010054 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000055 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060058 tok->buf = tok->cur = tok->inp = NULL;
59 tok->start = NULL;
60 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000061 tok->done = E_OK;
62 tok->fp = NULL;
63 tok->input = NULL;
64 tok->tabsize = TABSIZE;
65 tok->indent = 0;
66 tok->indstack[0] = 0;
67 tok->atbol = 1;
68 tok->pendin = 0;
69 tok->prompt = tok->nextprompt = NULL;
70 tok->lineno = 0;
71 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000072 tok->altindstack[0] = 0;
73 tok->decoding_state = STATE_INIT;
74 tok->decoding_erred = 0;
75 tok->read_coding_spec = 0;
76 tok->enc = NULL;
77 tok->encoding = NULL;
78 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020079 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000080 tok->decoding_readline = NULL;
81 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080082 tok->type_comments = 0;
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +020083 tok->stdin_content = NULL;
Yury Selivanov96ec9342015-07-23 15:01:58 +030084
Guido van Rossum495da292019-03-07 12:38:08 -080085 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000091}
92
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000093static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070094new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095{
Victor Stinner00d7abd2020-12-01 09:56:42 +010096 char* result = (char *)PyMem_Malloc(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 memcpy(result, s, len);
102 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000104}
105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100111 PyMem_Free(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Map the common spellings of the UTF-8 and Latin-1 encoding names to a
   canonical form.

   At most the first 12 bytes of S are examined; they are lower-cased and
   '_' is replaced by '-' before comparison.  Returns the string literal
   "utf-8" or "iso-8859-1" when S matches one of the recognized spellings
   (optionally followed by '-' and a suffix); otherwise returns S itself. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        else if (c == '_') {
            buf[i] = '-';
        }
        else {
            buf[i] = (char)tolower(c);
        }
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    else {
        return s;
    }
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
166 if (strncmp(t, "coding", 6) == 0) {
167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
173 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100187 PyMem_Free(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000211
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200212 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700217 if (!get_coding_spec(line, &cs, size, tok))
218 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200219 if (!cs) {
220 Py_ssize_t i;
221 for (i = 0; i < size; i++) {
222 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223 break;
224 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225 /* Stop checking coding spec after a line containing
226 * anything except a comment. */
227 tok->read_coding_spec = 1;
228 break;
229 }
230 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700231 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200232 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 tok->read_coding_spec = 1;
234 if (tok->encoding == NULL) {
235 assert(tok->decoding_state == STATE_RAW);
236 if (strcmp(cs, "utf-8") == 0) {
237 tok->encoding = cs;
238 } else {
239 r = set_readline(tok, cs);
240 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700244 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300245 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 "encoding problem: %s", cs);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100247 PyMem_Free(cs);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700248 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 } else { /* then, compare cs with BOM */
251 r = (strcmp(tok->encoding, cs) == 0);
252 if (!r)
253 PyErr_Format(PyExc_SyntaxError,
254 "encoding problem: %s with BOM", cs);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100255 PyMem_Free(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258}
259
260/* See whether the file starts with a BOM. If it does,
261 invoke the set_readline function with the new encoding.
262 Return 1 on success, 0 on failure. */
263
264static int
265check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 void unget_char(int, struct tok_state *),
267 int set_readline(struct tok_state *, const char *),
268 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 int ch1, ch2, ch3;
271 ch1 = get_char(tok);
272 tok->decoding_state = STATE_RAW;
273 if (ch1 == EOF) {
274 return 1;
275 } else if (ch1 == 0xEF) {
276 ch2 = get_char(tok);
277 if (ch2 != 0xBB) {
278 unget_char(ch2, tok);
279 unget_char(ch1, tok);
280 return 1;
281 }
282 ch3 = get_char(tok);
283 if (ch3 != 0xBF) {
284 unget_char(ch3, tok);
285 unget_char(ch2, tok);
286 unget_char(ch1, tok);
287 return 1;
288 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 /* Disable support for UTF-16 BOMs until a decision
291 is made whether this needs to be supported. */
292 } else if (ch1 == 0xFE) {
293 ch2 = get_char(tok);
294 if (ch2 != 0xFF) {
295 unget_char(ch2, tok);
296 unget_char(ch1, tok);
297 return 1;
298 }
299 if (!set_readline(tok, "utf-16-be"))
300 return 0;
301 tok->decoding_state = STATE_NORMAL;
302 } else if (ch1 == 0xFF) {
303 ch2 = get_char(tok);
304 if (ch2 != 0xFE) {
305 unget_char(ch2, tok);
306 unget_char(ch1, tok);
307 return 1;
308 }
309 if (!set_readline(tok, "utf-16-le"))
310 return 0;
311 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000312#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000313 } else {
314 unget_char(ch1, tok);
315 return 1;
316 }
317 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100318 PyMem_Free(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 tok->encoding = new_string("utf-8", 5, tok);
320 if (!tok->encoding)
321 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 /* No need to set_readline: input is already utf-8 */
323 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000324}
325
326/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000327 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000328
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000329 On entry, tok->decoding_buffer will be one of:
330 1) NULL: need to call tok->decoding_readline to get a new line
331 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000333 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 (in the s buffer) to copy entire contents of the line read
335 by tok->decoding_readline. tok->decoding_buffer has the overflow.
336 In this case, fp_readl is called in a loop (with an expanded buffer)
337 until the buffer ends with a '\n' (or until the end of the file is
338 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000339*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340
341static char *
342fp_readl(char *s, int size, struct tok_state *tok)
343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 PyObject* bufobj;
345 const char *buf;
346 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 /* Ask for one less byte so we can terminate it */
349 assert(size > 0);
350 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 if (tok->decoding_buffer) {
353 bufobj = tok->decoding_buffer;
354 Py_INCREF(bufobj);
355 }
356 else
357 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100358 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 if (bufobj == NULL)
360 goto error;
361 }
362 if (PyUnicode_CheckExact(bufobj))
363 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200364 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 if (buf == NULL) {
366 goto error;
367 }
368 }
369 else
370 {
371 buf = PyByteArray_AsString(bufobj);
372 if (buf == NULL) {
373 goto error;
374 }
375 buflen = PyByteArray_GET_SIZE(bufobj);
376 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 Py_XDECREF(tok->decoding_buffer);
379 if (buflen > size) {
380 /* Too many chars, the rest goes into tok->decoding_buffer */
381 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382 buflen-size);
383 if (tok->decoding_buffer == NULL)
384 goto error;
385 buflen = size;
386 }
387 else
388 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 memcpy(s, buf, buflen);
391 s[buflen] = '\0';
392 if (buflen == 0) /* EOF */
393 s = NULL;
394 Py_DECREF(bufobj);
395 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000396
397error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 Py_XDECREF(bufobj);
399 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400}
401
402/* Set the readline function for TOK to a StreamReader's
403 readline function. The StreamReader is named ENC.
404
405 This function is called from check_bom and check_coding_spec.
406
407 ENC is usually identical to the future value of tok->encoding,
408 except for the (currently unsupported) case of UTF-16.
409
410 Return 1 on success, 0 on failure. */
411
412static int
413fp_setreadl(struct tok_state *tok, const char* enc)
414{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700415 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200416 _Py_IDENTIFIER(open);
417 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000418 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200419 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000420
Victor Stinner22a351a2010-10-14 12:04:34 +0000421 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200422 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100423 * position of tok->fp. If tok->fp was opened in text mode on Windows,
424 * its file position counts CRLF as one char and can't be directly mapped
425 * to the file offset for fd. Instead we step back one byte and read to
426 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200427 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100428 if (pos == -1 ||
429 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000430 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700431 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000432 }
433
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700434 io = PyImport_ImportModuleNoBlock("io");
435 if (io == NULL)
436 return 0;
437
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200438 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000439 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700440 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700442 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200444 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700445 Py_DECREF(stream);
446 if (readline == NULL)
447 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300448 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100450 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100451 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700452 if (bufobj == NULL)
453 return 0;
454 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100455 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700457 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458}
459
460/* Fetch the next byte from TOK. */
461
462static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464}
465
466/* Unfetch the last byte back into TOK. */
467
468static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470}
471
/* Check whether the bytes at S start a structurally valid UTF-8 sequence
   (a lead byte followed by the right number of continuation bytes).
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must point into a NUL-terminated buffer.  Continuation bytes are
   examined front to back and the scan stops at the first byte that is not
   a continuation byte, so a sequence truncated by the terminating NUL is
   rejected without reading beyond the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int continuation_bytes;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xC0) {
        /* following byte */
        return 0;
    }
    if (*s < 0xE0) {
        continuation_bytes = 1;
    }
    else if (*s < 0xF0) {
        continuation_bytes = 2;
    }
    else if (*s < 0xF8) {
        continuation_bytes = 3;
    }
    else {
        return 0;
    }
    for (i = 1; i <= continuation_bytes; i++) {
        /* A NUL terminator is < 0x80, so it ends the scan here. */
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return continuation_bytes + 1;
}
499
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500/* Read a line of input from TOK. Determine encoding
501 if necessary. */
502
503static char *
504decoding_fgets(char *s, int size, struct tok_state *tok)
505{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 char *line = NULL;
507 int badchar = 0;
508 for (;;) {
509 if (tok->decoding_state == STATE_NORMAL) {
510 /* We already have a codec associated with
511 this input. */
512 line = fp_readl(s, size, tok);
513 break;
514 } else if (tok->decoding_state == STATE_RAW) {
515 /* We want a 'raw' read. */
516 line = Py_UniversalNewlineFgets(s, size,
517 tok->fp, NULL);
518 break;
519 } else {
520 /* We have not yet determined the encoding.
521 If an encoding is found, use the file-pointer
522 reader functions from now on. */
523 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524 return error_ret(tok);
525 assert(tok->decoding_state != STATE_INIT);
526 }
527 }
528 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530 return error_ret(tok);
531 }
532 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000533 /* The default encoding is UTF-8, so make sure we don't have any
534 non-UTF-8 sequences in it. */
535 if (line && !tok->encoding) {
536 unsigned char *c;
537 int length;
538 for (c = (unsigned char *)line; *c; c += length)
539 if (!(length = valid_utf8(c))) {
540 badchar = *c;
541 break;
542 }
543 }
544 if (badchar) {
545 /* Need to add 1 to the line number, since this line
546 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200547 PyErr_Format(PyExc_SyntaxError,
548 "Non-UTF-8 code starting with '\\x%.2x' "
549 "in file %U on line %i, "
550 "but no encoding declared; "
551 "see http://python.org/dev/peps/pep-0263/ for details",
552 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000553 return error_ret(tok);
554 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000555 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556}
557
558static int
559decoding_feof(struct tok_state *tok)
560{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 if (tok->decoding_state != STATE_NORMAL) {
562 return feof(tok->fp);
563 } else {
564 PyObject* buf = tok->decoding_buffer;
565 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100566 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000567 if (buf == NULL) {
568 error_ret(tok);
569 return 1;
570 } else {
571 tok->decoding_buffer = buf;
572 }
573 }
574 return PyObject_Length(buf) == 0;
575 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576}
577
578/* Fetch a byte from TOK, using the string buffer. */
579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000580static int
581buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000582 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000583}
584
585/* Unfetch a byte from TOK, using the string buffer. */
586
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000587static void
588buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000589 tok->str--;
590 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
593/* Set the readline function for TOK to ENC. For the string-based
594 tokenizer, this means to just record the encoding. */
595
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000596static int
597buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 tok->enc = enc;
599 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600}
601
602/* Return a UTF-8 encoding Python string object from the
603 C byte string STR, which is encoded with ENC. */
604
605static PyObject *
606translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 PyObject *utf8;
608 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609 if (buf == NULL)
610 return NULL;
611 utf8 = PyUnicode_AsUTF8String(buf);
612 Py_DECREF(buf);
613 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614}
615
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000616
617static char *
618translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200619 int skip_next_lf = 0;
620 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 char *buf, *current;
622 char c = '\0';
Victor Stinner00d7abd2020-12-01 09:56:42 +0100623 buf = PyMem_Malloc(needed_length);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 if (buf == NULL) {
625 tok->done = E_NOMEM;
626 return NULL;
627 }
628 for (current = buf; *s; s++, current++) {
629 c = *s;
630 if (skip_next_lf) {
631 skip_next_lf = 0;
632 if (c == '\n') {
633 c = *++s;
634 if (!c)
635 break;
636 }
637 }
638 if (c == '\r') {
639 skip_next_lf = 1;
640 c = '\n';
641 }
642 *current = c;
643 }
644 /* If this is exec input, add a newline to the end of the string if
645 there isn't one already. */
646 if (exec_input && c != '\n') {
647 *current = '\n';
648 current++;
649 }
650 *current = '\0';
651 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000652 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 /* should never fail */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100654 char* result = PyMem_Realloc(buf, final_length);
Pablo Galindocb90c892019-03-19 17:17:58 +0000655 if (result == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100656 PyMem_Free(buf);
Pablo Galindocb90c892019-03-19 17:17:58 +0000657 }
658 buf = result;
659 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000660 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000661}
662
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663/* Decode a byte string STR for use as the buffer of TOK.
664 Look for encoding declarations inside STR, and record them
665 inside TOK. */
666
Andy Lester384f3c52020-02-27 20:44:52 -0600667static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000668decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600671 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 const char *s;
673 const char *newl[2] = {NULL, NULL};
674 int lineno = 0;
675 tok->input = str = translate_newlines(input, single, tok);
676 if (str == NULL)
677 return NULL;
678 tok->enc = NULL;
679 tok->str = str;
680 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
681 return error_ret(tok);
682 str = tok->str; /* string after BOM if any */
683 assert(str);
684 if (tok->enc != NULL) {
685 utf8 = translate_into_utf8(str, tok->enc);
686 if (utf8 == NULL)
687 return error_ret(tok);
688 str = PyBytes_AsString(utf8);
689 }
690 for (s = str;; s++) {
691 if (*s == '\0') break;
692 else if (*s == '\n') {
693 assert(lineno < 2);
694 newl[lineno] = s;
695 lineno++;
696 if (lineno == 2) break;
697 }
698 }
699 tok->enc = NULL;
700 /* need to check line 1 and 2 separately since check_coding_spec
701 assumes a single line as input */
702 if (newl[0]) {
703 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200705 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707 tok, buf_setreadl))
708 return error_ret(tok);
709 }
710 }
711 if (tok->enc != NULL) {
712 assert(utf8 == NULL);
713 utf8 = translate_into_utf8(str, tok->enc);
714 if (utf8 == NULL)
715 return error_ret(tok);
716 str = PyBytes_AS_STRING(utf8);
717 }
718 assert(tok->decoding_buffer == NULL);
719 tok->decoding_buffer = utf8; /* CAUTION */
720 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000721}
722
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723/* Set up tokenizer for string */
724
725struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000726PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600729 char *decoded;
730
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 if (tok == NULL)
732 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600733 decoded = decode_str(str, exec_input, tok);
734 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 PyTokenizer_Free(tok);
736 return NULL;
737 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000738
Andy Lester384f3c52020-02-27 20:44:52 -0600739 tok->buf = tok->cur = tok->inp = decoded;
740 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742}
743
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000745PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000746{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600748 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 if (tok == NULL)
750 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600751 tok->input = translated = translate_newlines(str, exec_input, tok);
752 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 PyTokenizer_Free(tok);
754 return NULL;
755 }
756 tok->decoding_state = STATE_RAW;
757 tok->read_coding_spec = 1;
758 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600759 tok->str = translated;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100760 tok->encoding = (char *)PyMem_Malloc(6);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000761 if (!tok->encoding) {
762 PyTokenizer_Free(tok);
763 return NULL;
764 }
765 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000766
Andy Lester384f3c52020-02-27 20:44:52 -0600767 tok->buf = tok->cur = tok->inp = translated;
768 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000770}
771
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000772/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773
774struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300775PyTokenizer_FromFile(FILE *fp, const char* enc,
776 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 struct tok_state *tok = tok_new();
779 if (tok == NULL)
780 return NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100781 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 PyTokenizer_Free(tok);
783 return NULL;
784 }
785 tok->cur = tok->inp = tok->buf;
786 tok->end = tok->buf + BUFSIZ;
787 tok->fp = fp;
788 tok->prompt = ps1;
789 tok->nextprompt = ps2;
790 if (enc != NULL) {
791 /* Must copy encoding declaration since it
792 gets copied into the parse tree. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100793 tok->encoding = PyMem_Malloc(strlen(enc)+1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 if (!tok->encoding) {
795 PyTokenizer_Free(tok);
796 return NULL;
797 }
798 strcpy(tok->encoding, enc);
799 tok->decoding_state = STATE_NORMAL;
800 }
801 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
804
805/* Free a tok_state structure */
806
807void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000808PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 if (tok->encoding != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100811 PyMem_Free(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 Py_XDECREF(tok->decoding_readline);
813 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200814 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000815 if (tok->fp != NULL && tok->buf != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100816 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 if (tok->input)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100818 PyMem_Free(tok->input);
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +0200819 if (tok->stdin_content)
820 PyMem_Free(tok->stdin_content);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100821 PyMem_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822}
823
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824/* Get next char, updating state; error code goes into tok->done */
825
826static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200827tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 for (;;) {
830 if (tok->cur != tok->inp) {
831 return Py_CHARMASK(*tok->cur++); /* Fast path */
832 }
833 if (tok->done != E_OK)
834 return EOF;
835 if (tok->fp == NULL) {
836 char *end = strchr(tok->inp, '\n');
837 if (end != NULL)
838 end++;
839 else {
840 end = strchr(tok->inp, '\0');
841 if (end == tok->inp) {
842 tok->done = E_EOF;
843 return EOF;
844 }
845 }
846 if (tok->start == NULL)
847 tok->buf = tok->cur;
848 tok->line_start = tok->cur;
849 tok->lineno++;
850 tok->inp = end;
851 return Py_CHARMASK(*tok->cur++);
852 }
853 if (tok->prompt != NULL) {
854 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000855 if (newtok != NULL) {
856 char *translated = translate_newlines(newtok, 0, tok);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100857 PyMem_Free(newtok);
Victor Stinner89e34362011-01-07 18:47:22 +0000858 if (translated == NULL)
859 return EOF;
860 newtok = translated;
Lysandros Nikolaoue5fe5092021-01-14 23:36:30 +0200861 if (tok->stdin_content == NULL) {
862 tok->stdin_content = PyMem_Malloc(strlen(translated) + 1);
863 if (tok->stdin_content == NULL) {
864 tok->done = E_NOMEM;
865 return EOF;
866 }
867 sprintf(tok->stdin_content, "%s", translated);
868 }
869 else {
870 char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1);
871 if (new_str == NULL) {
872 tok->done = E_NOMEM;
873 return EOF;
874 }
875 sprintf(new_str, "%s%s", tok->stdin_content, translated);
876 PyMem_Free(tok->stdin_content);
877 tok->stdin_content = new_str;
878 }
Victor Stinner89e34362011-01-07 18:47:22 +0000879 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000880 if (tok->encoding && newtok && *newtok) {
881 /* Recode to UTF-8 */
882 Py_ssize_t buflen;
883 const char* buf;
884 PyObject *u = translate_into_utf8(newtok, tok->encoding);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100885 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000886 if (!u) {
887 tok->done = E_DECODE;
888 return EOF;
889 }
890 buflen = PyBytes_GET_SIZE(u);
891 buf = PyBytes_AS_STRING(u);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100892 newtok = PyMem_Malloc(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700893 if (newtok == NULL) {
894 Py_DECREF(u);
895 tok->done = E_NOMEM;
896 return EOF;
897 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 strcpy(newtok, buf);
899 Py_DECREF(u);
900 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 if (tok->nextprompt != NULL)
902 tok->prompt = tok->nextprompt;
903 if (newtok == NULL)
904 tok->done = E_INTR;
905 else if (*newtok == '\0') {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100906 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000907 tok->done = E_EOF;
908 }
909 else if (tok->start != NULL) {
910 size_t start = tok->start - tok->buf;
911 size_t oldlen = tok->cur - tok->buf;
912 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000913 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 char *buf = tok->buf;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100915 buf = (char *)PyMem_Realloc(buf, newlen+1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916 tok->lineno++;
917 if (buf == NULL) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100918 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000919 tok->buf = NULL;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100920 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921 tok->done = E_NOMEM;
922 return EOF;
923 }
924 tok->buf = buf;
925 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000926 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000927 tok->line_start = tok->cur;
928 strcpy(tok->buf + oldlen, newtok);
Victor Stinner00d7abd2020-12-01 09:56:42 +0100929 PyMem_Free(newtok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930 tok->inp = tok->buf + newlen;
931 tok->end = tok->inp + 1;
932 tok->start = tok->buf + start;
933 }
934 else {
935 tok->lineno++;
936 if (tok->buf != NULL)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100937 PyMem_Free(tok->buf);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000938 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 tok->cur = tok->buf;
940 tok->line_start = tok->buf;
941 tok->inp = strchr(tok->buf, '\0');
942 tok->end = tok->inp + 1;
943 }
944 }
945 else {
946 int done = 0;
947 Py_ssize_t cur = 0;
948 char *pt;
949 if (tok->start == NULL) {
950 if (tok->buf == NULL) {
951 tok->buf = (char *)
Victor Stinner00d7abd2020-12-01 09:56:42 +0100952 PyMem_Malloc(BUFSIZ);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 if (tok->buf == NULL) {
954 tok->done = E_NOMEM;
955 return EOF;
956 }
957 tok->end = tok->buf + BUFSIZ;
958 }
959 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
960 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200961 if (!tok->decoding_erred)
962 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 done = 1;
964 }
965 else {
966 tok->done = E_OK;
967 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700968 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000969 }
970 }
971 else {
972 cur = tok->cur - tok->buf;
973 if (decoding_feof(tok)) {
974 tok->done = E_EOF;
975 done = 1;
976 }
977 else
978 tok->done = E_OK;
979 }
980 tok->lineno++;
981 /* Read until '\n' or EOF */
982 while (!done) {
983 Py_ssize_t curstart = tok->start == NULL ? -1 :
984 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700985 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000986 Py_ssize_t curvalid = tok->inp - tok->buf;
987 Py_ssize_t newsize = curvalid + BUFSIZ;
988 char *newbuf = tok->buf;
Victor Stinner00d7abd2020-12-01 09:56:42 +0100989 newbuf = (char *)PyMem_Realloc(newbuf,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000990 newsize);
991 if (newbuf == NULL) {
992 tok->done = E_NOMEM;
993 tok->cur = tok->inp;
994 return EOF;
995 }
996 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200997 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700998 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200999 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 tok->inp = tok->buf + curvalid;
1001 tok->end = tok->buf + newsize;
1002 tok->start = curstart < 0 ? NULL :
1003 tok->buf + curstart;
1004 if (decoding_fgets(tok->inp,
1005 (int)(tok->end - tok->inp),
1006 tok) == NULL) {
1007 /* Break out early on decoding
1008 errors, as tok->buf will be NULL
1009 */
1010 if (tok->decoding_erred)
1011 return EOF;
1012 /* Last line does not end in \n,
1013 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -07001014 if (tok->inp[-1] != '\n')
1015 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 }
1017 tok->inp = strchr(tok->inp, '\0');
1018 done = tok->inp[-1] == '\n';
1019 }
1020 if (tok->buf != NULL) {
1021 tok->cur = tok->buf + cur;
1022 tok->line_start = tok->cur;
1023 /* replace "\r\n" with "\n" */
1024 /* For Mac leave the \r, giving a syntax error */
1025 pt = tok->inp - 2;
1026 if (pt >= tok->buf && *pt == '\r') {
1027 *pt++ = '\n';
1028 *pt = '\0';
1029 tok->inp = pt;
1030 }
1031 }
1032 }
1033 if (tok->done != E_OK) {
1034 if (tok->prompt != NULL)
1035 PySys_WriteStderr("\n");
1036 tok->cur = tok->inp;
1037 return EOF;
1038 }
1039 }
1040 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041}
1042
1043
1044/* Back-up one character */
1045
1046static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001047tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001049 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001050 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001051 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001052 }
1053 if (*tok->cur != c) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054 *tok->cur = c;
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001055 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057}
1058
1059
Guido van Rossum926f13a1998-04-09 21:38:06 +00001060static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001061syntaxerror(struct tok_state *tok, const char *format, ...)
1062{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001063 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001064 va_list vargs;
1065#ifdef HAVE_STDARG_PROTOTYPES
1066 va_start(vargs, format);
1067#else
1068 va_start(vargs);
1069#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001070 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001071 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001072 if (!errmsg) {
1073 goto error;
1074 }
1075
1076 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1077 "replace");
1078 if (!errtext) {
1079 goto error;
1080 }
1081 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1082 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1083 if (line_len != tok->cur - tok->line_start) {
1084 Py_DECREF(errtext);
1085 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1086 "replace");
1087 }
1088 if (!errtext) {
1089 goto error;
1090 }
1091
1092 args = Py_BuildValue("(O(OiiN))", errmsg,
1093 tok->filename, tok->lineno, offset, errtext);
1094 if (args) {
1095 PyErr_SetObject(PyExc_SyntaxError, args);
1096 Py_DECREF(args);
1097 }
1098
1099error:
1100 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001101 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001102 return ERRORTOKEN;
1103}
1104
1105static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001106indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001107{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001108 tok->done = E_TABSPACE;
1109 tok->cur = tok->inp;
1110 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001111}
1112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113/* Verify that the identifier follows PEP 3131.
1114 All identifier strings are guaranteed to be "ready" unicode objects.
1115 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001116static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001117verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001118{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 PyObject *s;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001120 if (tok->decoding_erred)
1121 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001123 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001124 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001125 tok->done = E_DECODE;
1126 }
1127 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 tok->done = E_ERROR;
1129 }
1130 return 0;
1131 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001132 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1133 if (invalid < 0) {
1134 Py_DECREF(s);
1135 tok->done = E_ERROR;
1136 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001137 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001138 assert(PyUnicode_GET_LENGTH(s) > 0);
1139 if (invalid < PyUnicode_GET_LENGTH(s)) {
1140 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1141 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1142 /* Determine the offset in UTF-8 encoded input */
1143 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1144 if (s != NULL) {
1145 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1146 }
1147 if (s == NULL) {
1148 tok->done = E_ERROR;
1149 return 0;
1150 }
1151 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1152 }
1153 Py_DECREF(s);
1154 // PyUnicode_FromFormatV() does not support %X
1155 char hex[9];
Victor Stinnere822e372020-06-15 21:59:47 +02001156 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001157 if (Py_UNICODE_ISPRINTABLE(ch)) {
1158 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1159 }
1160 else {
1161 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1162 }
1163 return 0;
1164 }
1165 Py_DECREF(s);
1166 return 1;
Martin v. Löwis47383402007-08-15 07:32:56 +00001167}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001168
/* Consume the rest of a run of decimal digits in which single underscores
   may separate digit groups.

   Returns the first character read after the run (it is NOT pushed back;
   it may be EOF).  If an underscore is not followed by a digit, the
   offending character is pushed back, a SyntaxError is reported and 0 is
   returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* An underscore must be followed by at least one more digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1190
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191/* Get next token, after space stripping etc. */
1192
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001193static int
Andy Lester384f3c52020-02-27 20:44:52 -06001194tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001195{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001196 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001198
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001199 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 tok->start = NULL;
1202 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001203
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 /* Get indentation level */
1205 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001206 int col = 0;
1207 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 tok->atbol = 0;
1209 for (;;) {
1210 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001211 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001212 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001213 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001215 col = (col / tok->tabsize + 1) * tok->tabsize;
1216 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 }
Brett Cannona721aba2016-09-09 14:57:09 -07001218 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001220 }
1221 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001223 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 }
1225 tok_backup(tok, c);
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001226 if (c == '#' || c == '\n' || c == '\\') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 /* Lines with only whitespace and/or comments
Lysandros Nikolaou896f4cf2020-06-11 02:56:08 +03001228 and/or a line continuation character
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 shouldn't affect the indentation and are
1230 not passed to the parser as NEWLINE tokens,
1231 except *totally* empty lines in interactive
1232 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001233 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001235 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001236 else if (tok->prompt != NULL && tok->lineno == 1) {
1237 /* In interactive mode, if the first line contains
1238 only spaces and/or a comment, let it through. */
1239 blankline = 0;
1240 col = altcol = 0;
1241 }
Brett Cannona721aba2016-09-09 14:57:09 -07001242 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001244 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 /* We can't jump back right here since we still
1246 may need to skip to the end of a comment */
1247 }
1248 if (!blankline && tok->level == 0) {
1249 if (col == tok->indstack[tok->indent]) {
1250 /* No change */
1251 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001252 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 }
1254 }
1255 else if (col > tok->indstack[tok->indent]) {
1256 /* Indent -- always one */
1257 if (tok->indent+1 >= MAXINDENT) {
1258 tok->done = E_TOODEEP;
1259 tok->cur = tok->inp;
1260 return ERRORTOKEN;
1261 }
1262 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001263 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 }
1265 tok->pendin++;
1266 tok->indstack[++tok->indent] = col;
1267 tok->altindstack[tok->indent] = altcol;
1268 }
1269 else /* col < tok->indstack[tok->indent] */ {
1270 /* Dedent -- any number, must be consistent */
1271 while (tok->indent > 0 &&
1272 col < tok->indstack[tok->indent]) {
1273 tok->pendin--;
1274 tok->indent--;
1275 }
1276 if (col != tok->indstack[tok->indent]) {
1277 tok->done = E_DEDENT;
1278 tok->cur = tok->inp;
1279 return ERRORTOKEN;
1280 }
1281 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001282 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 }
1284 }
1285 }
1286 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001287
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001289
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 /* Return pending indents/dedents */
1291 if (tok->pendin != 0) {
1292 if (tok->pendin < 0) {
1293 tok->pendin++;
1294 return DEDENT;
1295 }
1296 else {
1297 tok->pendin--;
1298 return INDENT;
1299 }
1300 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001301
Guido van Rossum495da292019-03-07 12:38:08 -08001302 /* Peek ahead at the next character */
1303 c = tok_nextc(tok);
1304 tok_backup(tok, c);
1305 /* Check if we are closing an async function */
1306 if (tok->async_def
1307 && !blankline
1308 /* Due to some implementation artifacts of type comments,
1309 * a TYPE_COMMENT at the start of a function won't set an
1310 * indentation level and it will produce a NEWLINE after it.
1311 * To avoid spuriously ending an async function due to this,
1312 * wait until we have some non-newline char in front of us. */
1313 && c != '\n'
1314 && tok->level == 0
1315 /* There was a NEWLINE after ASYNC DEF,
1316 so we're past the signature. */
1317 && tok->async_def_nl
1318 /* Current indentation level is less than where
1319 the async function was defined */
1320 && tok->async_def_indent >= tok->indent)
1321 {
1322 tok->async_def = 0;
1323 tok->async_def_indent = 0;
1324 tok->async_def_nl = 0;
1325 }
1326
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 tok->start = NULL;
1329 /* Skip spaces */
1330 do {
1331 c = tok_nextc(tok);
1332 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001333
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 /* Set start of current token */
1335 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001336
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001337 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001338 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001339 const char *prefix, *p, *type_start;
1340
Brett Cannona721aba2016-09-09 14:57:09 -07001341 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001343 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001344
1345 if (tok->type_comments) {
1346 p = tok->start;
1347 prefix = type_comment_prefix;
1348 while (*prefix && p < tok->cur) {
1349 if (*prefix == ' ') {
1350 while (*p == ' ' || *p == '\t') {
1351 p++;
1352 }
1353 } else if (*prefix == *p) {
1354 p++;
1355 } else {
1356 break;
1357 }
1358
1359 prefix++;
1360 }
1361
1362 /* This is a type comment if we matched all of type_comment_prefix. */
1363 if (!*prefix) {
1364 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001365 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001366 tok_backup(tok, c); /* don't eat the newline or EOF */
1367
1368 type_start = p;
1369
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001370 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001371 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001372 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001373 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001374 && !(tok->cur > ignore_end
1375 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001376
1377 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001378 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001379 *p_end = tok->cur;
1380
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001381 /* If this type ignore is the only thing on the line, consume the newline also. */
1382 if (blankline) {
1383 tok_nextc(tok);
1384 tok->atbol = 1;
1385 }
1386 return TYPE_IGNORE;
1387 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001388 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001389 *p_end = tok->cur;
1390 return TYPE_COMMENT;
1391 }
1392 }
1393 }
Brett Cannona721aba2016-09-09 14:57:09 -07001394 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001395
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 /* Check for EOF and errors now */
1397 if (c == EOF) {
Pablo Galindod6d63712021-01-19 23:59:33 +00001398 if (tok->level) {
1399 return ERRORTOKEN;
1400 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1402 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001403
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 /* Identifier (most frequent token!) */
1405 nonascii = 0;
1406 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001407 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001408 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001409 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001410 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001411 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001412 /* Since this is a backwards compatibility support literal we don't
1413 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001414 else if (!(saw_b || saw_u || saw_r || saw_f)
1415 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001416 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001417 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001418 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001419 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001420 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001421 }
1422 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001423 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001424 }
1425 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001426 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001427 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001429 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001431 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 }
1433 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001434 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001436 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 c = tok_nextc(tok);
1438 }
1439 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001440 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001442 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001443
1444 *p_start = tok->start;
1445 *p_end = tok->cur;
1446
Guido van Rossum495da292019-03-07 12:38:08 -08001447 /* async/await parsing block. */
1448 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1449 /* May be an 'async' or 'await' token. For Python 3.7 or
1450 later we recognize them unconditionally. For Python
1451 3.5 or 3.6 we recognize 'async' in front of 'def', and
1452 either one inside of 'async def'. (Technically we
1453 shouldn't recognize these at all for 3.4 or earlier,
1454 but there's no *valid* Python 3.4 code that would be
1455 rejected, and async functions will be rejected in a
1456 later phase.) */
1457 if (!tok->async_hacks || tok->async_def) {
1458 /* Always recognize the keywords. */
1459 if (memcmp(tok->start, "async", 5) == 0) {
1460 return ASYNC;
1461 }
1462 if (memcmp(tok->start, "await", 5) == 0) {
1463 return AWAIT;
1464 }
1465 }
1466 else if (memcmp(tok->start, "async", 5) == 0) {
1467 /* The current token is 'async'.
1468 Look ahead one token to see if that is 'def'. */
1469
1470 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001471 const char *ahead_tok_start = NULL;
1472 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001473 int ahead_tok_kind;
1474
1475 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1476 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1477 &ahead_tok_end);
1478
1479 if (ahead_tok_kind == NAME
1480 && ahead_tok.cur - ahead_tok.start == 3
1481 && memcmp(ahead_tok.start, "def", 3) == 0)
1482 {
1483 /* The next token is going to be 'def', so instead of
1484 returning a plain NAME token, return ASYNC. */
1485 tok->async_def_indent = tok->indent;
1486 tok->async_def = 1;
1487 return ASYNC;
1488 }
1489 }
1490 }
1491
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001492 return NAME;
1493 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 /* Newline */
1496 if (c == '\n') {
1497 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001498 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001500 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 *p_start = tok->start;
1502 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1503 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001504 if (tok->async_def) {
1505 /* We're somewhere inside an 'async def' function, and
1506 we've encountered a NEWLINE after its signature. */
1507 tok->async_def_nl = 1;
1508 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001509 return NEWLINE;
1510 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001511
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001512 /* Period or number starting with period? */
1513 if (c == '.') {
1514 c = tok_nextc(tok);
1515 if (isdigit(c)) {
1516 goto fraction;
1517 } else if (c == '.') {
1518 c = tok_nextc(tok);
1519 if (c == '.') {
1520 *p_start = tok->start;
1521 *p_end = tok->cur;
1522 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001523 }
1524 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 tok_backup(tok, c);
1526 }
1527 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001528 }
1529 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001530 tok_backup(tok, c);
1531 }
1532 *p_start = tok->start;
1533 *p_end = tok->cur;
1534 return DOT;
1535 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001536
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 /* Number */
1538 if (isdigit(c)) {
1539 if (c == '0') {
1540 /* Hex, octal or binary -- maybe. */
1541 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 /* Hex */
1544 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001546 if (c == '_') {
1547 c = tok_nextc(tok);
1548 }
1549 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001550 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001551 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001552 }
1553 do {
1554 c = tok_nextc(tok);
1555 } while (isxdigit(c));
1556 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001557 }
1558 else if (c == 'o' || c == 'O') {
1559 /* Octal */
1560 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001562 if (c == '_') {
1563 c = tok_nextc(tok);
1564 }
1565 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001566 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001567 if (isdigit(c)) {
1568 return syntaxerror(tok,
1569 "invalid digit '%c' in octal literal", c);
1570 }
1571 else {
1572 return syntaxerror(tok, "invalid octal literal");
1573 }
Brett Cannona721aba2016-09-09 14:57:09 -07001574 }
1575 do {
1576 c = tok_nextc(tok);
1577 } while ('0' <= c && c < '8');
1578 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001579 if (isdigit(c)) {
1580 return syntaxerror(tok,
1581 "invalid digit '%c' in octal literal", c);
1582 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001583 }
1584 else if (c == 'b' || c == 'B') {
1585 /* Binary */
1586 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001588 if (c == '_') {
1589 c = tok_nextc(tok);
1590 }
1591 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001592 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001593 if (isdigit(c)) {
1594 return syntaxerror(tok,
1595 "invalid digit '%c' in binary literal", c);
1596 }
1597 else {
1598 return syntaxerror(tok, "invalid binary literal");
1599 }
Brett Cannona721aba2016-09-09 14:57:09 -07001600 }
1601 do {
1602 c = tok_nextc(tok);
1603 } while (c == '0' || c == '1');
1604 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001605 if (isdigit(c)) {
1606 return syntaxerror(tok,
1607 "invalid digit '%c' in binary literal", c);
1608 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 }
1610 else {
1611 int nonzero = 0;
1612 /* maybe old-style octal; c is first char of it */
1613 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001614 while (1) {
1615 if (c == '_') {
1616 c = tok_nextc(tok);
1617 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001618 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001619 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001620 }
1621 }
1622 if (c != '0') {
1623 break;
1624 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 c = tok_nextc(tok);
1626 }
Brett Cannona721aba2016-09-09 14:57:09 -07001627 if (isdigit(c)) {
1628 nonzero = 1;
1629 c = tok_decimal_tail(tok);
1630 if (c == 0) {
1631 return ERRORTOKEN;
1632 }
1633 }
1634 if (c == '.') {
1635 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001637 }
1638 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001639 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001640 }
1641 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001643 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001645 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001647 return syntaxerror(tok,
1648 "leading zeros in decimal integer "
1649 "literals are not permitted; "
1650 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 }
1652 }
1653 }
1654 else {
1655 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001656 c = tok_decimal_tail(tok);
1657 if (c == 0) {
1658 return ERRORTOKEN;
1659 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 {
1661 /* Accept floating point numbers. */
1662 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001663 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 fraction:
1665 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001666 if (isdigit(c)) {
1667 c = tok_decimal_tail(tok);
1668 if (c == 0) {
1669 return ERRORTOKEN;
1670 }
1671 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001672 }
1673 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001674 int e;
1675 exponent:
1676 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 /* Exponent part */
1678 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001679 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001681 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001682 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001683 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001684 }
1685 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001687 tok_backup(tok, e);
1688 *p_start = tok->start;
1689 *p_end = tok->cur;
1690 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691 }
Brett Cannona721aba2016-09-09 14:57:09 -07001692 c = tok_decimal_tail(tok);
1693 if (c == 0) {
1694 return ERRORTOKEN;
1695 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 }
Brett Cannona721aba2016-09-09 14:57:09 -07001697 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 /* Imaginary part */
1699 imaginary:
1700 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001701 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 }
1703 }
1704 tok_backup(tok, c);
1705 *p_start = tok->start;
1706 *p_end = tok->cur;
1707 return NUMBER;
1708 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001709
1710 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 /* String */
1712 if (c == '\'' || c == '"') {
1713 int quote = c;
1714 int quote_size = 1; /* 1 or 3 */
1715 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001716
Anthony Sottile995d9b92019-01-12 20:05:13 -08001717 /* Nodes of type STRING, especially multi line strings
1718 must be handled differently in order to get both
1719 the starting line number and the column offset right.
1720 (cf. issue 16806) */
1721 tok->first_lineno = tok->lineno;
1722 tok->multi_line_start = tok->line_start;
1723
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 /* Find the quote size and start of string */
1725 c = tok_nextc(tok);
1726 if (c == quote) {
1727 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001728 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001730 }
1731 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001733 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001734 }
Brett Cannona721aba2016-09-09 14:57:09 -07001735 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001737 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001738
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001739 /* Get rest of string */
1740 while (end_quote_size != quote_size) {
1741 c = tok_nextc(tok);
1742 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001743 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001745 }
1746 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001748 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 tok->cur = tok->inp;
1750 return ERRORTOKEN;
1751 }
1752 if (quote_size == 1 && c == '\n') {
1753 tok->done = E_EOLS;
1754 tok->cur = tok->inp;
1755 return ERRORTOKEN;
1756 }
Brett Cannona721aba2016-09-09 14:57:09 -07001757 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001759 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001760 else {
1761 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001762 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001763 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001764 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 }
1766 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001767
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 *p_start = tok->start;
1769 *p_end = tok->cur;
1770 return STRING;
1771 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001772
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001773 /* Line continuation */
1774 if (c == '\\') {
1775 c = tok_nextc(tok);
1776 if (c != '\n') {
1777 tok->done = E_LINECONT;
1778 tok->cur = tok->inp;
1779 return ERRORTOKEN;
1780 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001781 c = tok_nextc(tok);
1782 if (c == EOF) {
1783 tok->done = E_EOF;
1784 tok->cur = tok->inp;
1785 return ERRORTOKEN;
1786 } else {
1787 tok_backup(tok, c);
1788 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001789 tok->cont_line = 1;
1790 goto again; /* Read next line */
1791 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001792
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001793 /* Check for two-character token */
1794 {
1795 int c2 = tok_nextc(tok);
1796 int token = PyToken_TwoChars(c, c2);
1797 if (token != OP) {
1798 int c3 = tok_nextc(tok);
1799 int token3 = PyToken_ThreeChars(c, c2, c3);
1800 if (token3 != OP) {
1801 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001802 }
1803 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001804 tok_backup(tok, c3);
1805 }
1806 *p_start = tok->start;
1807 *p_end = tok->cur;
1808 return token;
1809 }
1810 tok_backup(tok, c2);
1811 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001812
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001813 /* Keep track of parentheses nesting level */
1814 switch (c) {
1815 case '(':
1816 case '[':
1817 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001818 if (tok->level >= MAXLEVEL) {
1819 return syntaxerror(tok, "too many nested parentheses");
1820 }
1821 tok->parenstack[tok->level] = c;
1822 tok->parenlinenostack[tok->level] = tok->lineno;
Pablo Galindod6d63712021-01-19 23:59:33 +00001823 tok->parencolstack[tok->level] = tok->start - tok->line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001824 tok->level++;
1825 break;
1826 case ')':
1827 case ']':
1828 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001829 if (!tok->level) {
1830 return syntaxerror(tok, "unmatched '%c'", c);
1831 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001832 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001833 int opening = tok->parenstack[tok->level];
1834 if (!((opening == '(' && c == ')') ||
1835 (opening == '[' && c == ']') ||
1836 (opening == '{' && c == '}')))
1837 {
1838 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1839 return syntaxerror(tok,
1840 "closing parenthesis '%c' does not match "
1841 "opening parenthesis '%c' on line %d",
1842 c, opening, tok->parenlinenostack[tok->level]);
1843 }
1844 else {
1845 return syntaxerror(tok,
1846 "closing parenthesis '%c' does not match "
1847 "opening parenthesis '%c'",
1848 c, opening);
1849 }
1850 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 break;
1852 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001853
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001854 /* Punctuation character */
1855 *p_start = tok->start;
1856 *p_end = tok->cur;
1857 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001858}
1859
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001860int
Andy Lester384f3c52020-02-27 20:44:52 -06001861PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001862{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001863 int result = tok_get(tok, p_start, p_end);
1864 if (tok->decoding_erred) {
1865 result = ERRORTOKEN;
1866 tok->done = E_DECODE;
1867 }
1868 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001869}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001870
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001871/* Get the encoding of a Python file. Check for the coding cookie and check if
1872 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001873
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001874 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1875 encoding in the first or second line of the file (in which case the encoding
1876 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001877
Victor Stinner00d7abd2020-12-01 09:56:42 +01001878 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001879 by the caller. */
1880
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001881char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001882PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001883{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001884 struct tok_state *tok;
1885 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001886 const char *p_start = NULL;
1887 const char *p_end = NULL;
1888 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001889
Victor Stinnerdaf45552013-08-28 00:53:59 +02001890 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001891 if (fd < 0) {
1892 return NULL;
1893 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001894
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001895 fp = fdopen(fd, "r");
1896 if (fp == NULL) {
1897 return NULL;
1898 }
1899 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1900 if (tok == NULL) {
1901 fclose(fp);
1902 return NULL;
1903 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001904 if (filename != NULL) {
1905 Py_INCREF(filename);
1906 tok->filename = filename;
1907 }
1908 else {
1909 tok->filename = PyUnicode_FromString("<string>");
1910 if (tok->filename == NULL) {
1911 fclose(fp);
1912 PyTokenizer_Free(tok);
1913 return encoding;
1914 }
1915 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001916 while (tok->lineno < 2 && tok->done == E_OK) {
1917 PyTokenizer_Get(tok, &p_start, &p_end);
1918 }
1919 fclose(fp);
1920 if (tok->encoding) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01001921 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001922 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301923 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001924 }
1925 PyTokenizer_Free(tok);
1926 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001927}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001928
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001929char *
1930PyTokenizer_FindEncoding(int fd)
1931{
1932 return PyTokenizer_FindEncodingFilename(fd, NULL);
1933}
1934
Guido van Rossum408027e1996-12-30 16:17:54 +00001935#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001936
1937void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001938tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001939{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001940 printf("%s", _PyParser_TokenNames[type]);
1941 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1942 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001943}
1944
1945#endif