blob: 95dfc5388037d02eeffcdf29239987b09a9a79b0 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* Non-zero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  The argument is fully parenthesized so that any
   expression can be passed, but it is still evaluated several times and
   must therefore be free of side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000027
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  The argument is fully parenthesized
   so that any expression can be passed, but it is still evaluated several
   times and must therefore be free of side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Serhiy Storchakac6792272013-10-19 21:03:34 +030035extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000036/* Return malloc'ed string including trailing \n;
37 empty malloc'ed string for EOF;
38 NULL if interrupted */
39
Guido van Rossum4fe87291992-02-26 15:24:44 +000040/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000044static struct tok_state *tok_new(void);
45static int tok_nextc(struct tok_state *tok);
46static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000047
Brett Cannond5ec98c2007-10-20 02:54:14 +000048
Guido van Rossumdcfcd142019-01-31 03:40:27 -080049/* Spaces in this constant are treated as "zero or more spaces or tabs" when
50 tokenizing. */
51static const char* type_comment_prefix = "# type: ";
52
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053/* Create and initialize a new tok_state structure */
54
55static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000056tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59 sizeof(struct tok_state));
60 if (tok == NULL)
61 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060062 tok->buf = tok->cur = tok->inp = NULL;
63 tok->start = NULL;
64 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065 tok->done = E_OK;
66 tok->fp = NULL;
67 tok->input = NULL;
68 tok->tabsize = TABSIZE;
69 tok->indent = 0;
70 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000072 tok->atbol = 1;
73 tok->pendin = 0;
74 tok->prompt = tok->nextprompt = NULL;
75 tok->lineno = 0;
76 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->altindstack[0] = 0;
78 tok->decoding_state = STATE_INIT;
79 tok->decoding_erred = 0;
80 tok->read_coding_spec = 0;
81 tok->enc = NULL;
82 tok->encoding = NULL;
83 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020084 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000085 tok->decoding_readline = NULL;
86 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080087 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030088
Guido van Rossum495da292019-03-07 12:38:08 -080089 tok->async_hacks = 0;
90 tok->async_def = 0;
91 tok->async_def_indent = 0;
92 tok->async_def_nl = 0;
93
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000094 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095}
96
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000097static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000099{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 if (!result) {
102 tok->done = E_NOMEM;
103 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700105 memcpy(result, s, len);
106 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000108}
109
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000110static char *
111error_ret(struct tok_state *tok) /* XXX */
112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 tok->decoding_erred = 1;
114 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
115 PyMem_FREE(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600116 tok->buf = tok->cur = tok->inp = NULL;
117 tok->start = NULL;
118 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200119 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121}
122
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000123
/* Map common spellings of the UTF-8 and Latin-1 encoding names to a
   canonical form.

   At most the first 12 bytes of S are examined, after lower-casing and
   replacing '_' with '-'.  Returns the static string "utf-8" or
   "iso-8859-1" when S is one of the known spellings (optionally followed
   by a "-suffix"); otherwise returns S itself, so callers can detect
   "no change" by pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as unsigned
           char; a plain (possibly negative) char would be undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
152
153/* Return the coding spec in S, or NULL if none is found. */
154
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155static int
156get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000157{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 /* Coding spec must be in a comment, and that comment must be
161 * the only statement on the source code line. */
162 for (i = 0; i < size - 6; i++) {
163 if (s[i] == '#')
164 break;
165 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 }
168 for (; i < size - 6; i++) { /* XXX inefficient search */
169 const char* t = s + i;
170 if (strncmp(t, "coding", 6) == 0) {
171 const char* begin = NULL;
172 t += 6;
173 if (t[0] != ':' && t[0] != '=')
174 continue;
175 do {
176 t++;
177 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 begin = t;
180 while (Py_ISALNUM(t[0]) ||
181 t[0] == '-' || t[0] == '_' || t[0] == '.')
182 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700185 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200186 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700187 if (!r)
188 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700189 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 if (r != q) {
191 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 r = new_string(q, strlen(q), tok);
193 if (!r)
194 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700196 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200197 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
199 }
200 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700201 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
204/* Check whether the line contains a coding spec. If it does,
205 invoke the set_readline function for the new encoding.
206 This function receives the tok_state and the new encoding.
207 Return 1 on success, 0 on failure. */
208
209static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000212{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000215
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200220 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700221 if (!get_coding_spec(line, &cs, size, tok))
222 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200223 if (!cs) {
224 Py_ssize_t i;
225 for (i = 0; i < size; i++) {
226 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
227 break;
228 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
229 /* Stop checking coding spec after a line containing
230 * anything except a comment. */
231 tok->read_coding_spec = 1;
232 break;
233 }
234 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700235 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200236 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 tok->read_coding_spec = 1;
238 if (tok->encoding == NULL) {
239 assert(tok->decoding_state == STATE_RAW);
240 if (strcmp(cs, "utf-8") == 0) {
241 tok->encoding = cs;
242 } else {
243 r = set_readline(tok, cs);
244 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700248 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300249 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 "encoding problem: %s", cs);
251 PyMem_FREE(cs);
252 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000253 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700254 } else { /* then, compare cs with BOM */
255 r = (strcmp(tok->encoding, cs) == 0);
256 if (!r)
257 PyErr_Format(PyExc_SyntaxError,
258 "encoding problem: %s with BOM", cs);
259 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262}
263
264/* See whether the file starts with a BOM. If it does,
265 invoke the set_readline function with the new encoding.
266 Return 1 on success, 0 on failure. */
267
268static int
269check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 void unget_char(int, struct tok_state *),
271 int set_readline(struct tok_state *, const char *),
272 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 int ch1, ch2, ch3;
275 ch1 = get_char(tok);
276 tok->decoding_state = STATE_RAW;
277 if (ch1 == EOF) {
278 return 1;
279 } else if (ch1 == 0xEF) {
280 ch2 = get_char(tok);
281 if (ch2 != 0xBB) {
282 unget_char(ch2, tok);
283 unget_char(ch1, tok);
284 return 1;
285 }
286 ch3 = get_char(tok);
287 if (ch3 != 0xBF) {
288 unget_char(ch3, tok);
289 unget_char(ch2, tok);
290 unget_char(ch1, tok);
291 return 1;
292 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000294 /* Disable support for UTF-16 BOMs until a decision
295 is made whether this needs to be supported. */
296 } else if (ch1 == 0xFE) {
297 ch2 = get_char(tok);
298 if (ch2 != 0xFF) {
299 unget_char(ch2, tok);
300 unget_char(ch1, tok);
301 return 1;
302 }
303 if (!set_readline(tok, "utf-16-be"))
304 return 0;
305 tok->decoding_state = STATE_NORMAL;
306 } else if (ch1 == 0xFF) {
307 ch2 = get_char(tok);
308 if (ch2 != 0xFE) {
309 unget_char(ch2, tok);
310 unget_char(ch1, tok);
311 return 1;
312 }
313 if (!set_readline(tok, "utf-16-le"))
314 return 0;
315 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000316#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 } else {
318 unget_char(ch1, tok);
319 return 1;
320 }
321 if (tok->encoding != NULL)
322 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700323 tok->encoding = new_string("utf-8", 5, tok);
324 if (!tok->encoding)
325 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 /* No need to set_readline: input is already utf-8 */
327 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328}
329
330/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000331 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000333 On entry, tok->decoding_buffer will be one of:
334 1) NULL: need to call tok->decoding_readline to get a new line
335 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000336 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000337 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 (in the s buffer) to copy entire contents of the line read
339 by tok->decoding_readline. tok->decoding_buffer has the overflow.
340 In this case, fp_readl is called in a loop (with an expanded buffer)
341 until the buffer ends with a '\n' (or until the end of the file is
342 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344
345static char *
346fp_readl(char *s, int size, struct tok_state *tok)
347{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 PyObject* bufobj;
349 const char *buf;
350 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 /* Ask for one less byte so we can terminate it */
353 assert(size > 0);
354 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 if (tok->decoding_buffer) {
357 bufobj = tok->decoding_buffer;
358 Py_INCREF(bufobj);
359 }
360 else
361 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100362 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000363 if (bufobj == NULL)
364 goto error;
365 }
366 if (PyUnicode_CheckExact(bufobj))
367 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200368 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000369 if (buf == NULL) {
370 goto error;
371 }
372 }
373 else
374 {
375 buf = PyByteArray_AsString(bufobj);
376 if (buf == NULL) {
377 goto error;
378 }
379 buflen = PyByteArray_GET_SIZE(bufobj);
380 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 Py_XDECREF(tok->decoding_buffer);
383 if (buflen > size) {
384 /* Too many chars, the rest goes into tok->decoding_buffer */
385 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
388 goto error;
389 buflen = size;
390 }
391 else
392 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 memcpy(s, buf, buflen);
395 s[buflen] = '\0';
396 if (buflen == 0) /* EOF */
397 s = NULL;
398 Py_DECREF(bufobj);
399 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000400
401error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000402 Py_XDECREF(bufobj);
403 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000404}
405
406/* Set the readline function for TOK to a StreamReader's
407 readline function. The StreamReader is named ENC.
408
409 This function is called from check_bom and check_coding_spec.
410
411 ENC is usually identical to the future value of tok->encoding,
412 except for the (currently unsupported) case of UTF-16.
413
414 Return 1 on success, 0 on failure. */
415
416static int
417fp_setreadl(struct tok_state *tok, const char* enc)
418{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700419 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200420 _Py_IDENTIFIER(open);
421 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000422 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200423 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
Victor Stinner22a351a2010-10-14 12:04:34 +0000425 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200426 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100427 * position of tok->fp. If tok->fp was opened in text mode on Windows,
428 * its file position counts CRLF as one char and can't be directly mapped
429 * to the file offset for fd. Instead we step back one byte and read to
430 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200431 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100432 if (pos == -1 ||
433 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000434 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700435 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000436 }
437
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700438 io = PyImport_ImportModuleNoBlock("io");
439 if (io == NULL)
440 return 0;
441
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200442 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000443 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000445 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700446 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000447
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200448 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449 Py_DECREF(stream);
450 if (readline == NULL)
451 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300452 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700453
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100454 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100455 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 if (bufobj == NULL)
457 return 0;
458 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100459 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000460
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462}
463
464/* Fetch the next byte from TOK. */
465
466static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468}
469
470/* Unfetch the last byte back into TOK. */
471
472static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
/* Check whether the bytes at S start a structurally valid UTF-8 sequence
   in a NUL-terminated buffer.  Return the number of bytes forming the
   sequence (1 to 4) if yes, 0 if not.

   Only the lead byte range and the 10xxxxxx form of the continuation bytes
   are checked.  Continuation bytes are examined front to back: the NUL
   terminator is < 0x80 and so fails the check, which guarantees that a
   sequence truncated by the end of the string is rejected without reading
   past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int length;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        length = 2;
    else if (*s < 0xF0)
        length = 3;
    else if (*s < 0xF8)
        length = 4;
    else
        return 0;
    for (i = 1; i < length; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return length;
}
503
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504/* Read a line of input from TOK. Determine encoding
505 if necessary. */
506
507static char *
508decoding_fgets(char *s, int size, struct tok_state *tok)
509{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 char *line = NULL;
511 int badchar = 0;
512 for (;;) {
513 if (tok->decoding_state == STATE_NORMAL) {
514 /* We already have a codec associated with
515 this input. */
516 line = fp_readl(s, size, tok);
517 break;
518 } else if (tok->decoding_state == STATE_RAW) {
519 /* We want a 'raw' read. */
520 line = Py_UniversalNewlineFgets(s, size,
521 tok->fp, NULL);
522 break;
523 } else {
524 /* We have not yet determined the encoding.
525 If an encoding is found, use the file-pointer
526 reader functions from now on. */
527 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
528 return error_ret(tok);
529 assert(tok->decoding_state != STATE_INIT);
530 }
531 }
532 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
533 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
534 return error_ret(tok);
535 }
536 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 /* The default encoding is UTF-8, so make sure we don't have any
538 non-UTF-8 sequences in it. */
539 if (line && !tok->encoding) {
540 unsigned char *c;
541 int length;
542 for (c = (unsigned char *)line; *c; c += length)
543 if (!(length = valid_utf8(c))) {
544 badchar = *c;
545 break;
546 }
547 }
548 if (badchar) {
549 /* Need to add 1 to the line number, since this line
550 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200551 PyErr_Format(PyExc_SyntaxError,
552 "Non-UTF-8 code starting with '\\x%.2x' "
553 "in file %U on line %i, "
554 "but no encoding declared; "
555 "see http://python.org/dev/peps/pep-0263/ for details",
556 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 return error_ret(tok);
558 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562static int
563decoding_feof(struct tok_state *tok)
564{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000565 if (tok->decoding_state != STATE_NORMAL) {
566 return feof(tok->fp);
567 } else {
568 PyObject* buf = tok->decoding_buffer;
569 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100570 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 if (buf == NULL) {
572 error_ret(tok);
573 return 1;
574 } else {
575 tok->decoding_buffer = buf;
576 }
577 }
578 return PyObject_Length(buf) == 0;
579 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580}
581
582/* Fetch a byte from TOK, using the string buffer. */
583
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000584static int
585buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587}
588
589/* Unfetch a byte from TOK, using the string buffer. */
590
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000591static void
592buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000593 tok->str--;
594 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595}
596
597/* Set the readline function for TOK to ENC. For the string-based
598 tokenizer, this means to just record the encoding. */
599
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000600static int
601buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 tok->enc = enc;
603 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604}
605
606/* Return a UTF-8 encoding Python string object from the
607 C byte string STR, which is encoded with ENC. */
608
609static PyObject *
610translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000611 PyObject *utf8;
612 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
613 if (buf == NULL)
614 return NULL;
615 utf8 = PyUnicode_AsUTF8String(buf);
616 Py_DECREF(buf);
617 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000620
621static char *
622translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200623 int skip_next_lf = 0;
624 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 char *buf, *current;
626 char c = '\0';
627 buf = PyMem_MALLOC(needed_length);
628 if (buf == NULL) {
629 tok->done = E_NOMEM;
630 return NULL;
631 }
632 for (current = buf; *s; s++, current++) {
633 c = *s;
634 if (skip_next_lf) {
635 skip_next_lf = 0;
636 if (c == '\n') {
637 c = *++s;
638 if (!c)
639 break;
640 }
641 }
642 if (c == '\r') {
643 skip_next_lf = 1;
644 c = '\n';
645 }
646 *current = c;
647 }
648 /* If this is exec input, add a newline to the end of the string if
649 there isn't one already. */
650 if (exec_input && c != '\n') {
651 *current = '\n';
652 current++;
653 }
654 *current = '\0';
655 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000656 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 /* should never fail */
Pablo Galindocb90c892019-03-19 17:17:58 +0000658 char* result = PyMem_REALLOC(buf, final_length);
659 if (result == NULL) {
660 PyMem_FREE(buf);
661 }
662 buf = result;
663 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000665}
666
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667/* Decode a byte string STR for use as the buffer of TOK.
668 Look for encoding declarations inside STR, and record them
669 inside TOK. */
670
Andy Lester384f3c52020-02-27 20:44:52 -0600671static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000672decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000673{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600675 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 const char *s;
677 const char *newl[2] = {NULL, NULL};
678 int lineno = 0;
679 tok->input = str = translate_newlines(input, single, tok);
680 if (str == NULL)
681 return NULL;
682 tok->enc = NULL;
683 tok->str = str;
684 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
685 return error_ret(tok);
686 str = tok->str; /* string after BOM if any */
687 assert(str);
688 if (tok->enc != NULL) {
689 utf8 = translate_into_utf8(str, tok->enc);
690 if (utf8 == NULL)
691 return error_ret(tok);
692 str = PyBytes_AsString(utf8);
693 }
694 for (s = str;; s++) {
695 if (*s == '\0') break;
696 else if (*s == '\n') {
697 assert(lineno < 2);
698 newl[lineno] = s;
699 lineno++;
700 if (lineno == 2) break;
701 }
702 }
703 tok->enc = NULL;
704 /* need to check line 1 and 2 separately since check_coding_spec
705 assumes a single line as input */
706 if (newl[0]) {
707 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200709 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711 tok, buf_setreadl))
712 return error_ret(tok);
713 }
714 }
715 if (tok->enc != NULL) {
716 assert(utf8 == NULL);
717 utf8 = translate_into_utf8(str, tok->enc);
718 if (utf8 == NULL)
719 return error_ret(tok);
720 str = PyBytes_AS_STRING(utf8);
721 }
722 assert(tok->decoding_buffer == NULL);
723 tok->decoding_buffer = utf8; /* CAUTION */
724 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000725}
726
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727/* Set up tokenizer for string */
728
729struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000730PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600733 char *decoded;
734
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 if (tok == NULL)
736 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600737 decoded = decode_str(str, exec_input, tok);
738 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyTokenizer_Free(tok);
740 return NULL;
741 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000742
Andy Lester384f3c52020-02-27 20:44:52 -0600743 tok->buf = tok->cur = tok->inp = decoded;
744 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746}
747
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000749PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000750{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600752 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 if (tok == NULL)
754 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600755 tok->input = translated = translate_newlines(str, exec_input, tok);
756 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 PyTokenizer_Free(tok);
758 return NULL;
759 }
760 tok->decoding_state = STATE_RAW;
761 tok->read_coding_spec = 1;
762 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600763 tok->str = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 tok->encoding = (char *)PyMem_MALLOC(6);
765 if (!tok->encoding) {
766 PyTokenizer_Free(tok);
767 return NULL;
768 }
769 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000770
Andy Lester384f3c52020-02-27 20:44:52 -0600771 tok->buf = tok->cur = tok->inp = translated;
772 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000774}
775
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000776/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777
778struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300779PyTokenizer_FromFile(FILE *fp, const char* enc,
780 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 struct tok_state *tok = tok_new();
783 if (tok == NULL)
784 return NULL;
785 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
786 PyTokenizer_Free(tok);
787 return NULL;
788 }
789 tok->cur = tok->inp = tok->buf;
790 tok->end = tok->buf + BUFSIZ;
791 tok->fp = fp;
792 tok->prompt = ps1;
793 tok->nextprompt = ps2;
794 if (enc != NULL) {
795 /* Must copy encoding declaration since it
796 gets copied into the parse tree. */
797 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
798 if (!tok->encoding) {
799 PyTokenizer_Free(tok);
800 return NULL;
801 }
802 strcpy(tok->encoding, enc);
803 tok->decoding_state = STATE_NORMAL;
804 }
805 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806}
807
808
809/* Free a tok_state structure */
810
811void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000812PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 if (tok->encoding != NULL)
815 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 Py_XDECREF(tok->decoding_readline);
817 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200818 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 if (tok->fp != NULL && tok->buf != NULL)
820 PyMem_FREE(tok->buf);
821 if (tok->input)
Andy Lester384f3c52020-02-27 20:44:52 -0600822 PyMem_FREE(tok->input);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824}
825
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826/* Get next char, updating state; error code goes into tok->done */
827
828static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200829tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000830{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 for (;;) {
832 if (tok->cur != tok->inp) {
833 return Py_CHARMASK(*tok->cur++); /* Fast path */
834 }
835 if (tok->done != E_OK)
836 return EOF;
837 if (tok->fp == NULL) {
838 char *end = strchr(tok->inp, '\n');
839 if (end != NULL)
840 end++;
841 else {
842 end = strchr(tok->inp, '\0');
843 if (end == tok->inp) {
844 tok->done = E_EOF;
845 return EOF;
846 }
847 }
848 if (tok->start == NULL)
849 tok->buf = tok->cur;
850 tok->line_start = tok->cur;
851 tok->lineno++;
852 tok->inp = end;
853 return Py_CHARMASK(*tok->cur++);
854 }
855 if (tok->prompt != NULL) {
856 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000857 if (newtok != NULL) {
858 char *translated = translate_newlines(newtok, 0, tok);
859 PyMem_FREE(newtok);
860 if (translated == NULL)
861 return EOF;
862 newtok = translated;
863 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 if (tok->encoding && newtok && *newtok) {
865 /* Recode to UTF-8 */
866 Py_ssize_t buflen;
867 const char* buf;
868 PyObject *u = translate_into_utf8(newtok, tok->encoding);
869 PyMem_FREE(newtok);
870 if (!u) {
871 tok->done = E_DECODE;
872 return EOF;
873 }
874 buflen = PyBytes_GET_SIZE(u);
875 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700877 if (newtok == NULL) {
878 Py_DECREF(u);
879 tok->done = E_NOMEM;
880 return EOF;
881 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 strcpy(newtok, buf);
883 Py_DECREF(u);
884 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 if (tok->nextprompt != NULL)
886 tok->prompt = tok->nextprompt;
887 if (newtok == NULL)
888 tok->done = E_INTR;
889 else if (*newtok == '\0') {
890 PyMem_FREE(newtok);
891 tok->done = E_EOF;
892 }
893 else if (tok->start != NULL) {
894 size_t start = tok->start - tok->buf;
895 size_t oldlen = tok->cur - tok->buf;
896 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000897 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 char *buf = tok->buf;
899 buf = (char *)PyMem_REALLOC(buf, newlen+1);
900 tok->lineno++;
901 if (buf == NULL) {
902 PyMem_FREE(tok->buf);
903 tok->buf = NULL;
904 PyMem_FREE(newtok);
905 tok->done = E_NOMEM;
906 return EOF;
907 }
908 tok->buf = buf;
909 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000910 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 tok->line_start = tok->cur;
912 strcpy(tok->buf + oldlen, newtok);
913 PyMem_FREE(newtok);
914 tok->inp = tok->buf + newlen;
915 tok->end = tok->inp + 1;
916 tok->start = tok->buf + start;
917 }
918 else {
919 tok->lineno++;
920 if (tok->buf != NULL)
921 PyMem_FREE(tok->buf);
922 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 tok->cur = tok->buf;
924 tok->line_start = tok->buf;
925 tok->inp = strchr(tok->buf, '\0');
926 tok->end = tok->inp + 1;
927 }
928 }
929 else {
930 int done = 0;
931 Py_ssize_t cur = 0;
932 char *pt;
933 if (tok->start == NULL) {
934 if (tok->buf == NULL) {
935 tok->buf = (char *)
936 PyMem_MALLOC(BUFSIZ);
937 if (tok->buf == NULL) {
938 tok->done = E_NOMEM;
939 return EOF;
940 }
941 tok->end = tok->buf + BUFSIZ;
942 }
943 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
944 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200945 if (!tok->decoding_erred)
946 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000947 done = 1;
948 }
949 else {
950 tok->done = E_OK;
951 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700952 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 }
954 }
955 else {
956 cur = tok->cur - tok->buf;
957 if (decoding_feof(tok)) {
958 tok->done = E_EOF;
959 done = 1;
960 }
961 else
962 tok->done = E_OK;
963 }
964 tok->lineno++;
965 /* Read until '\n' or EOF */
966 while (!done) {
967 Py_ssize_t curstart = tok->start == NULL ? -1 :
968 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700969 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 Py_ssize_t curvalid = tok->inp - tok->buf;
971 Py_ssize_t newsize = curvalid + BUFSIZ;
972 char *newbuf = tok->buf;
973 newbuf = (char *)PyMem_REALLOC(newbuf,
974 newsize);
975 if (newbuf == NULL) {
976 tok->done = E_NOMEM;
977 tok->cur = tok->inp;
978 return EOF;
979 }
980 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200981 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700982 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200983 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000984 tok->inp = tok->buf + curvalid;
985 tok->end = tok->buf + newsize;
986 tok->start = curstart < 0 ? NULL :
987 tok->buf + curstart;
988 if (decoding_fgets(tok->inp,
989 (int)(tok->end - tok->inp),
990 tok) == NULL) {
991 /* Break out early on decoding
992 errors, as tok->buf will be NULL
993 */
994 if (tok->decoding_erred)
995 return EOF;
996 /* Last line does not end in \n,
997 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -0700998 if (tok->inp[-1] != '\n')
999 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 }
1001 tok->inp = strchr(tok->inp, '\0');
1002 done = tok->inp[-1] == '\n';
1003 }
1004 if (tok->buf != NULL) {
1005 tok->cur = tok->buf + cur;
1006 tok->line_start = tok->cur;
1007 /* replace "\r\n" with "\n" */
1008 /* For Mac leave the \r, giving a syntax error */
1009 pt = tok->inp - 2;
1010 if (pt >= tok->buf && *pt == '\r') {
1011 *pt++ = '\n';
1012 *pt = '\0';
1013 tok->inp = pt;
1014 }
1015 }
1016 }
1017 if (tok->done != E_OK) {
1018 if (tok->prompt != NULL)
1019 PySys_WriteStderr("\n");
1020 tok->cur = tok->inp;
1021 return EOF;
1022 }
1023 }
1024 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025}
1026
1027
1028/* Back-up one character */
1029
1030static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001031tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001034 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001035 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001036 }
1037 if (*tok->cur != c) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 *tok->cur = c;
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001039 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041}
1042
1043
Guido van Rossum926f13a1998-04-09 21:38:06 +00001044static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001045syntaxerror(struct tok_state *tok, const char *format, ...)
1046{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001047 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001048 va_list vargs;
1049#ifdef HAVE_STDARG_PROTOTYPES
1050 va_start(vargs, format);
1051#else
1052 va_start(vargs);
1053#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001054 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001055 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001056 if (!errmsg) {
1057 goto error;
1058 }
1059
1060 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1061 "replace");
1062 if (!errtext) {
1063 goto error;
1064 }
1065 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1066 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1067 if (line_len != tok->cur - tok->line_start) {
1068 Py_DECREF(errtext);
1069 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1070 "replace");
1071 }
1072 if (!errtext) {
1073 goto error;
1074 }
1075
1076 args = Py_BuildValue("(O(OiiN))", errmsg,
1077 tok->filename, tok->lineno, offset, errtext);
1078 if (args) {
1079 PyErr_SetObject(PyExc_SyntaxError, args);
1080 Py_DECREF(args);
1081 }
1082
1083error:
1084 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001085 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001086 return ERRORTOKEN;
1087}
1088
1089static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001090indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001091{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001092 tok->done = E_TABSPACE;
1093 tok->cur = tok->inp;
1094 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095}
1096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097/* Verify that the identifier follows PEP 3131.
1098 All identifier strings are guaranteed to be "ready" unicode objects.
1099 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001100static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001101verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001102{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 PyObject *s;
1104 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001105 if (tok->decoding_erred)
1106 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001108 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1110 PyErr_Clear();
1111 tok->done = E_IDENTIFIER;
1112 } else {
1113 tok->done = E_ERROR;
1114 }
1115 return 0;
1116 }
1117 result = PyUnicode_IsIdentifier(s);
1118 Py_DECREF(s);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001119 if (result == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 tok->done = E_IDENTIFIER;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001121 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001123}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001124
/* Consume the rest of a run of decimal digits, allowing single '_'
   separators between digits.

   Returns the first character that is neither a digit nor a valid
   separator; it has been read and not pushed back.  If a '_' is not
   followed by a digit, the offending character is pushed back, a
   SyntaxError is raised through syntaxerror() and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }

        /* An underscore must be followed by at least one digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1146
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001147/* Get next token, after space stripping etc. */
1148
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001149static int
Andy Lester384f3c52020-02-27 20:44:52 -06001150tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001151{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001152 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001156 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 tok->start = NULL;
1158 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001159
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 /* Get indentation level */
1161 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001162 int col = 0;
1163 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 tok->atbol = 0;
1165 for (;;) {
1166 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001167 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001169 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001171 col = (col / tok->tabsize + 1) * tok->tabsize;
1172 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 }
Brett Cannona721aba2016-09-09 14:57:09 -07001174 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001176 }
1177 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001179 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 }
1181 tok_backup(tok, c);
1182 if (c == '#' || c == '\n') {
1183 /* Lines with only whitespace and/or comments
1184 shouldn't affect the indentation and are
1185 not passed to the parser as NEWLINE tokens,
1186 except *totally* empty lines in interactive
1187 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001188 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001190 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001191 else if (tok->prompt != NULL && tok->lineno == 1) {
1192 /* In interactive mode, if the first line contains
1193 only spaces and/or a comment, let it through. */
1194 blankline = 0;
1195 col = altcol = 0;
1196 }
Brett Cannona721aba2016-09-09 14:57:09 -07001197 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001199 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 /* We can't jump back right here since we still
1201 may need to skip to the end of a comment */
1202 }
1203 if (!blankline && tok->level == 0) {
1204 if (col == tok->indstack[tok->indent]) {
1205 /* No change */
1206 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001207 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 }
1209 }
1210 else if (col > tok->indstack[tok->indent]) {
1211 /* Indent -- always one */
1212 if (tok->indent+1 >= MAXINDENT) {
1213 tok->done = E_TOODEEP;
1214 tok->cur = tok->inp;
1215 return ERRORTOKEN;
1216 }
1217 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001218 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 }
1220 tok->pendin++;
1221 tok->indstack[++tok->indent] = col;
1222 tok->altindstack[tok->indent] = altcol;
1223 }
1224 else /* col < tok->indstack[tok->indent] */ {
1225 /* Dedent -- any number, must be consistent */
1226 while (tok->indent > 0 &&
1227 col < tok->indstack[tok->indent]) {
1228 tok->pendin--;
1229 tok->indent--;
1230 }
1231 if (col != tok->indstack[tok->indent]) {
1232 tok->done = E_DEDENT;
1233 tok->cur = tok->inp;
1234 return ERRORTOKEN;
1235 }
1236 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001237 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 }
1239 }
1240 }
1241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001242
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 /* Return pending indents/dedents */
1246 if (tok->pendin != 0) {
1247 if (tok->pendin < 0) {
1248 tok->pendin++;
1249 return DEDENT;
1250 }
1251 else {
1252 tok->pendin--;
1253 return INDENT;
1254 }
1255 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001256
Guido van Rossum495da292019-03-07 12:38:08 -08001257 /* Peek ahead at the next character */
1258 c = tok_nextc(tok);
1259 tok_backup(tok, c);
1260 /* Check if we are closing an async function */
1261 if (tok->async_def
1262 && !blankline
1263 /* Due to some implementation artifacts of type comments,
1264 * a TYPE_COMMENT at the start of a function won't set an
1265 * indentation level and it will produce a NEWLINE after it.
1266 * To avoid spuriously ending an async function due to this,
1267 * wait until we have some non-newline char in front of us. */
1268 && c != '\n'
1269 && tok->level == 0
1270 /* There was a NEWLINE after ASYNC DEF,
1271 so we're past the signature. */
1272 && tok->async_def_nl
1273 /* Current indentation level is less than where
1274 the async function was defined */
1275 && tok->async_def_indent >= tok->indent)
1276 {
1277 tok->async_def = 0;
1278 tok->async_def_indent = 0;
1279 tok->async_def_nl = 0;
1280 }
1281
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 tok->start = NULL;
1284 /* Skip spaces */
1285 do {
1286 c = tok_nextc(tok);
1287 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001288
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 /* Set start of current token */
1290 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001291
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001292 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001293 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001294 const char *prefix, *p, *type_start;
1295
Brett Cannona721aba2016-09-09 14:57:09 -07001296 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001298 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001299
1300 if (tok->type_comments) {
1301 p = tok->start;
1302 prefix = type_comment_prefix;
1303 while (*prefix && p < tok->cur) {
1304 if (*prefix == ' ') {
1305 while (*p == ' ' || *p == '\t') {
1306 p++;
1307 }
1308 } else if (*prefix == *p) {
1309 p++;
1310 } else {
1311 break;
1312 }
1313
1314 prefix++;
1315 }
1316
1317 /* This is a type comment if we matched all of type_comment_prefix. */
1318 if (!*prefix) {
1319 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001320 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001321 tok_backup(tok, c); /* don't eat the newline or EOF */
1322
1323 type_start = p;
1324
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001325 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001326 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001327 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001328 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001329 && !(tok->cur > ignore_end
1330 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001331
1332 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001333 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001334 *p_end = tok->cur;
1335
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001336 /* If this type ignore is the only thing on the line, consume the newline also. */
1337 if (blankline) {
1338 tok_nextc(tok);
1339 tok->atbol = 1;
1340 }
1341 return TYPE_IGNORE;
1342 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001343 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001344 *p_end = tok->cur;
1345 return TYPE_COMMENT;
1346 }
1347 }
1348 }
Brett Cannona721aba2016-09-09 14:57:09 -07001349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001350
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 /* Check for EOF and errors now */
1352 if (c == EOF) {
1353 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1354 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 /* Identifier (most frequent token!) */
1357 nonascii = 0;
1358 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001359 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001360 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001361 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001362 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001363 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001364 /* Since this is a backwards compatibility support literal we don't
1365 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001366 else if (!(saw_b || saw_u || saw_r || saw_f)
1367 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001368 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001369 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001370 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001371 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001372 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001373 }
1374 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001375 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001376 }
1377 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001378 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001379 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001381 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001383 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 }
1385 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001386 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001388 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 c = tok_nextc(tok);
1390 }
1391 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001392 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001394 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001395
1396 *p_start = tok->start;
1397 *p_end = tok->cur;
1398
Lysandros Nikolaou41d5b942020-04-12 21:21:00 +03001399 if (c == '"' || c == '\'') {
1400 tok->done = E_BADPREFIX;
1401 return ERRORTOKEN;
1402 }
Guido van Rossum495da292019-03-07 12:38:08 -08001403 /* async/await parsing block. */
1404 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1405 /* May be an 'async' or 'await' token. For Python 3.7 or
1406 later we recognize them unconditionally. For Python
1407 3.5 or 3.6 we recognize 'async' in front of 'def', and
1408 either one inside of 'async def'. (Technically we
1409 shouldn't recognize these at all for 3.4 or earlier,
1410 but there's no *valid* Python 3.4 code that would be
1411 rejected, and async functions will be rejected in a
1412 later phase.) */
1413 if (!tok->async_hacks || tok->async_def) {
1414 /* Always recognize the keywords. */
1415 if (memcmp(tok->start, "async", 5) == 0) {
1416 return ASYNC;
1417 }
1418 if (memcmp(tok->start, "await", 5) == 0) {
1419 return AWAIT;
1420 }
1421 }
1422 else if (memcmp(tok->start, "async", 5) == 0) {
1423 /* The current token is 'async'.
1424 Look ahead one token to see if that is 'def'. */
1425
1426 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001427 const char *ahead_tok_start = NULL;
1428 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001429 int ahead_tok_kind;
1430
1431 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1432 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1433 &ahead_tok_end);
1434
1435 if (ahead_tok_kind == NAME
1436 && ahead_tok.cur - ahead_tok.start == 3
1437 && memcmp(ahead_tok.start, "def", 3) == 0)
1438 {
1439 /* The next token is going to be 'def', so instead of
1440 returning a plain NAME token, return ASYNC. */
1441 tok->async_def_indent = tok->indent;
1442 tok->async_def = 1;
1443 return ASYNC;
1444 }
1445 }
1446 }
1447
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001448 return NAME;
1449 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001450
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 /* Newline */
1452 if (c == '\n') {
1453 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001454 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001455 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001456 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001457 *p_start = tok->start;
1458 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1459 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001460 if (tok->async_def) {
1461 /* We're somewhere inside an 'async def' function, and
1462 we've encountered a NEWLINE after its signature. */
1463 tok->async_def_nl = 1;
1464 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 return NEWLINE;
1466 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001467
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001468 /* Period or number starting with period? */
1469 if (c == '.') {
1470 c = tok_nextc(tok);
1471 if (isdigit(c)) {
1472 goto fraction;
1473 } else if (c == '.') {
1474 c = tok_nextc(tok);
1475 if (c == '.') {
1476 *p_start = tok->start;
1477 *p_end = tok->cur;
1478 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001479 }
1480 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 tok_backup(tok, c);
1482 }
1483 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001484 }
1485 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 tok_backup(tok, c);
1487 }
1488 *p_start = tok->start;
1489 *p_end = tok->cur;
1490 return DOT;
1491 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Number */
1494 if (isdigit(c)) {
1495 if (c == '0') {
1496 /* Hex, octal or binary -- maybe. */
1497 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001498 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 /* Hex */
1500 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001502 if (c == '_') {
1503 c = tok_nextc(tok);
1504 }
1505 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001506 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001507 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001508 }
1509 do {
1510 c = tok_nextc(tok);
1511 } while (isxdigit(c));
1512 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 }
1514 else if (c == 'o' || c == 'O') {
1515 /* Octal */
1516 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001518 if (c == '_') {
1519 c = tok_nextc(tok);
1520 }
1521 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001522 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001523 if (isdigit(c)) {
1524 return syntaxerror(tok,
1525 "invalid digit '%c' in octal literal", c);
1526 }
1527 else {
1528 return syntaxerror(tok, "invalid octal literal");
1529 }
Brett Cannona721aba2016-09-09 14:57:09 -07001530 }
1531 do {
1532 c = tok_nextc(tok);
1533 } while ('0' <= c && c < '8');
1534 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001535 if (isdigit(c)) {
1536 return syntaxerror(tok,
1537 "invalid digit '%c' in octal literal", c);
1538 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 }
1540 else if (c == 'b' || c == 'B') {
1541 /* Binary */
1542 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001544 if (c == '_') {
1545 c = tok_nextc(tok);
1546 }
1547 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001548 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001549 if (isdigit(c)) {
1550 return syntaxerror(tok,
1551 "invalid digit '%c' in binary literal", c);
1552 }
1553 else {
1554 return syntaxerror(tok, "invalid binary literal");
1555 }
Brett Cannona721aba2016-09-09 14:57:09 -07001556 }
1557 do {
1558 c = tok_nextc(tok);
1559 } while (c == '0' || c == '1');
1560 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001561 if (isdigit(c)) {
1562 return syntaxerror(tok,
1563 "invalid digit '%c' in binary literal", c);
1564 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001565 }
1566 else {
1567 int nonzero = 0;
1568 /* maybe old-style octal; c is first char of it */
1569 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001570 while (1) {
1571 if (c == '_') {
1572 c = tok_nextc(tok);
1573 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001574 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001575 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001576 }
1577 }
1578 if (c != '0') {
1579 break;
1580 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001581 c = tok_nextc(tok);
1582 }
Brett Cannona721aba2016-09-09 14:57:09 -07001583 if (isdigit(c)) {
1584 nonzero = 1;
1585 c = tok_decimal_tail(tok);
1586 if (c == 0) {
1587 return ERRORTOKEN;
1588 }
1589 }
1590 if (c == '.') {
1591 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001593 }
1594 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001596 }
1597 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001599 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001601 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001602 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001603 return syntaxerror(tok,
1604 "leading zeros in decimal integer "
1605 "literals are not permitted; "
1606 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 }
1608 }
1609 }
1610 else {
1611 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001612 c = tok_decimal_tail(tok);
1613 if (c == 0) {
1614 return ERRORTOKEN;
1615 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001616 {
1617 /* Accept floating point numbers. */
1618 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001619 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 fraction:
1621 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001622 if (isdigit(c)) {
1623 c = tok_decimal_tail(tok);
1624 if (c == 0) {
1625 return ERRORTOKEN;
1626 }
1627 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 }
1629 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001630 int e;
1631 exponent:
1632 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 /* Exponent part */
1634 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001635 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001637 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001638 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001639 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001640 }
1641 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001643 tok_backup(tok, e);
1644 *p_start = tok->start;
1645 *p_end = tok->cur;
1646 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 }
Brett Cannona721aba2016-09-09 14:57:09 -07001648 c = tok_decimal_tail(tok);
1649 if (c == 0) {
1650 return ERRORTOKEN;
1651 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001652 }
Brett Cannona721aba2016-09-09 14:57:09 -07001653 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Imaginary part */
1655 imaginary:
1656 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001657 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 }
1659 }
1660 tok_backup(tok, c);
1661 *p_start = tok->start;
1662 *p_end = tok->cur;
1663 return NUMBER;
1664 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001665
1666 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001667 /* String */
1668 if (c == '\'' || c == '"') {
1669 int quote = c;
1670 int quote_size = 1; /* 1 or 3 */
1671 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001672
Anthony Sottile995d9b92019-01-12 20:05:13 -08001673 /* Nodes of type STRING, especially multi line strings
1674 must be handled differently in order to get both
1675 the starting line number and the column offset right.
1676 (cf. issue 16806) */
1677 tok->first_lineno = tok->lineno;
1678 tok->multi_line_start = tok->line_start;
1679
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680 /* Find the quote size and start of string */
1681 c = tok_nextc(tok);
1682 if (c == quote) {
1683 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001684 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001686 }
1687 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001689 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690 }
Brett Cannona721aba2016-09-09 14:57:09 -07001691 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001693 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001694
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 /* Get rest of string */
1696 while (end_quote_size != quote_size) {
1697 c = tok_nextc(tok);
1698 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001699 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001701 }
1702 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001703 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001704 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 tok->cur = tok->inp;
1706 return ERRORTOKEN;
1707 }
1708 if (quote_size == 1 && c == '\n') {
1709 tok->done = E_EOLS;
1710 tok->cur = tok->inp;
1711 return ERRORTOKEN;
1712 }
Brett Cannona721aba2016-09-09 14:57:09 -07001713 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001715 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001716 else {
1717 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001718 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001719 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001720 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 }
1722 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001723
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 *p_start = tok->start;
1725 *p_end = tok->cur;
1726 return STRING;
1727 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001728
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 /* Line continuation */
1730 if (c == '\\') {
1731 c = tok_nextc(tok);
1732 if (c != '\n') {
1733 tok->done = E_LINECONT;
1734 tok->cur = tok->inp;
1735 return ERRORTOKEN;
1736 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001737 c = tok_nextc(tok);
1738 if (c == EOF) {
1739 tok->done = E_EOF;
1740 tok->cur = tok->inp;
1741 return ERRORTOKEN;
1742 } else {
1743 tok_backup(tok, c);
1744 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 tok->cont_line = 1;
1746 goto again; /* Read next line */
1747 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001748
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 /* Check for two-character token */
1750 {
1751 int c2 = tok_nextc(tok);
1752 int token = PyToken_TwoChars(c, c2);
1753 if (token != OP) {
1754 int c3 = tok_nextc(tok);
1755 int token3 = PyToken_ThreeChars(c, c2, c3);
1756 if (token3 != OP) {
1757 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001758 }
1759 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001760 tok_backup(tok, c3);
1761 }
1762 *p_start = tok->start;
1763 *p_end = tok->cur;
1764 return token;
1765 }
1766 tok_backup(tok, c2);
1767 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001768
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 /* Keep track of parentheses nesting level */
1770 switch (c) {
1771 case '(':
1772 case '[':
1773 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001774 if (tok->level >= MAXLEVEL) {
1775 return syntaxerror(tok, "too many nested parentheses");
1776 }
1777 tok->parenstack[tok->level] = c;
1778 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 tok->level++;
1780 break;
1781 case ')':
1782 case ']':
1783 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001784 if (!tok->level) {
1785 return syntaxerror(tok, "unmatched '%c'", c);
1786 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001787 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001788 int opening = tok->parenstack[tok->level];
1789 if (!((opening == '(' && c == ')') ||
1790 (opening == '[' && c == ']') ||
1791 (opening == '{' && c == '}')))
1792 {
1793 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1794 return syntaxerror(tok,
1795 "closing parenthesis '%c' does not match "
1796 "opening parenthesis '%c' on line %d",
1797 c, opening, tok->parenlinenostack[tok->level]);
1798 }
1799 else {
1800 return syntaxerror(tok,
1801 "closing parenthesis '%c' does not match "
1802 "opening parenthesis '%c'",
1803 c, opening);
1804 }
1805 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 break;
1807 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001808
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001809 /* Punctuation character */
1810 *p_start = tok->start;
1811 *p_end = tok->cur;
1812 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001813}
1814
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001815int
Andy Lester384f3c52020-02-27 20:44:52 -06001816PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001817{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001818 int result = tok_get(tok, p_start, p_end);
1819 if (tok->decoding_erred) {
1820 result = ERRORTOKEN;
1821 tok->done = E_DECODE;
1822 }
1823 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001824}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001825
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001826/* Get the encoding of a Python file. Check for the coding cookie and check if
1827 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001828
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001829 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1830 encoding in the first or second line of the file (in which case the encoding
1831 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001832
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001833 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1834 by the caller. */
1835
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001836char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001837PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001838{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 struct tok_state *tok;
1840 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001841 const char *p_start = NULL;
1842 const char *p_end = NULL;
1843 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001844
Victor Stinnerdaf45552013-08-28 00:53:59 +02001845 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001846 if (fd < 0) {
1847 return NULL;
1848 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001849
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 fp = fdopen(fd, "r");
1851 if (fp == NULL) {
1852 return NULL;
1853 }
1854 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1855 if (tok == NULL) {
1856 fclose(fp);
1857 return NULL;
1858 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001859 if (filename != NULL) {
1860 Py_INCREF(filename);
1861 tok->filename = filename;
1862 }
1863 else {
1864 tok->filename = PyUnicode_FromString("<string>");
1865 if (tok->filename == NULL) {
1866 fclose(fp);
1867 PyTokenizer_Free(tok);
1868 return encoding;
1869 }
1870 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001871 while (tok->lineno < 2 && tok->done == E_OK) {
1872 PyTokenizer_Get(tok, &p_start, &p_end);
1873 }
1874 fclose(fp);
1875 if (tok->encoding) {
1876 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1877 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301878 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001879 }
1880 PyTokenizer_Free(tok);
1881 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001882}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001883
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001884char *
1885PyTokenizer_FindEncoding(int fd)
1886{
1887 return PyTokenizer_FindEncodingFilename(fd, NULL);
1888}
1889
Guido van Rossum408027e1996-12-30 16:17:54 +00001890#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001891
1892void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001893tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001894{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001895 printf("%s", _PyParser_TokenNames[type]);
1896 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1897 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001898}
1899
1900#endif