blob: 0f2b6af5e50adfac9fdcf063a944a99923d1959a [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* Non-zero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  The argument is expanded several times, so it must be
   free of side effects; it is parenthesized so that an arbitrary
   expression can be passed safely. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000027
/* Non-zero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.  The argument is expanded several
   times, so it must be free of side effects; it is parenthesized so
   that an arbitrary expression can be passed safely. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Serhiy Storchakac6792272013-10-19 21:03:34 +030035extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000036/* Return malloc'ed string including trailing \n;
37 empty malloc'ed string for EOF;
38 NULL if interrupted */
39
Guido van Rossum4fe87291992-02-26 15:24:44 +000040/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000044static struct tok_state *tok_new(void);
45static int tok_nextc(struct tok_state *tok);
46static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000047
Brett Cannond5ec98c2007-10-20 02:54:14 +000048
Guido van Rossumdcfcd142019-01-31 03:40:27 -080049/* Spaces in this constant are treated as "zero or more spaces or tabs" when
50 tokenizing. */
51static const char* type_comment_prefix = "# type: ";
52
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053/* Create and initialize a new tok_state structure */
54
55static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000056tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59 sizeof(struct tok_state));
60 if (tok == NULL)
61 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060062 tok->buf = tok->cur = tok->inp = NULL;
63 tok->start = NULL;
64 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065 tok->done = E_OK;
66 tok->fp = NULL;
67 tok->input = NULL;
68 tok->tabsize = TABSIZE;
69 tok->indent = 0;
70 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000072 tok->atbol = 1;
73 tok->pendin = 0;
74 tok->prompt = tok->nextprompt = NULL;
75 tok->lineno = 0;
76 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->altindstack[0] = 0;
78 tok->decoding_state = STATE_INIT;
79 tok->decoding_erred = 0;
80 tok->read_coding_spec = 0;
81 tok->enc = NULL;
82 tok->encoding = NULL;
83 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020084 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000085 tok->decoding_readline = NULL;
86 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080087 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030088
Guido van Rossum495da292019-03-07 12:38:08 -080089 tok->async_hacks = 0;
90 tok->async_def = 0;
91 tok->async_def_indent = 0;
92 tok->async_def_nl = 0;
93
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000094 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095}
96
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000097static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000099{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 if (!result) {
102 tok->done = E_NOMEM;
103 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700105 memcpy(result, s, len);
106 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000108}
109
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000110static char *
111error_ret(struct tok_state *tok) /* XXX */
112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 tok->decoding_erred = 1;
114 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
115 PyMem_FREE(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600116 tok->buf = tok->cur = tok->inp = NULL;
117 tok->start = NULL;
118 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200119 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121}
122
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000123
/* Map the common spellings of UTF-8 and Latin-1 onto one canonical name.

   Only the first 12 characters of S are examined: they are lower-cased
   and '_' is treated as '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S names (or starts with "<name>-" for) one of those
   encodings; otherwise returns S itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* tolower() is only defined for EOF and values representable as
           unsigned char, so never hand it a (possibly negative) char. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
152
153/* Return the coding spec in S, or NULL if none is found. */
154
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155static int
156get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000157{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 /* Coding spec must be in a comment, and that comment must be
161 * the only statement on the source code line. */
162 for (i = 0; i < size - 6; i++) {
163 if (s[i] == '#')
164 break;
165 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 }
168 for (; i < size - 6; i++) { /* XXX inefficient search */
169 const char* t = s + i;
170 if (strncmp(t, "coding", 6) == 0) {
171 const char* begin = NULL;
172 t += 6;
173 if (t[0] != ':' && t[0] != '=')
174 continue;
175 do {
176 t++;
177 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 begin = t;
180 while (Py_ISALNUM(t[0]) ||
181 t[0] == '-' || t[0] == '_' || t[0] == '.')
182 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700185 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200186 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700187 if (!r)
188 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700189 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 if (r != q) {
191 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 r = new_string(q, strlen(q), tok);
193 if (!r)
194 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700196 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200197 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
199 }
200 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700201 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
204/* Check whether the line contains a coding spec. If it does,
205 invoke the set_readline function for the new encoding.
206 This function receives the tok_state and the new encoding.
207 Return 1 on success, 0 on failure. */
208
209static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000212{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000215
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200220 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700221 if (!get_coding_spec(line, &cs, size, tok))
222 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200223 if (!cs) {
224 Py_ssize_t i;
225 for (i = 0; i < size; i++) {
226 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
227 break;
228 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
229 /* Stop checking coding spec after a line containing
230 * anything except a comment. */
231 tok->read_coding_spec = 1;
232 break;
233 }
234 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700235 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200236 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 tok->read_coding_spec = 1;
238 if (tok->encoding == NULL) {
239 assert(tok->decoding_state == STATE_RAW);
240 if (strcmp(cs, "utf-8") == 0) {
241 tok->encoding = cs;
242 } else {
243 r = set_readline(tok, cs);
244 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700248 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300249 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 "encoding problem: %s", cs);
251 PyMem_FREE(cs);
252 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000253 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700254 } else { /* then, compare cs with BOM */
255 r = (strcmp(tok->encoding, cs) == 0);
256 if (!r)
257 PyErr_Format(PyExc_SyntaxError,
258 "encoding problem: %s with BOM", cs);
259 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262}
263
264/* See whether the file starts with a BOM. If it does,
265 invoke the set_readline function with the new encoding.
266 Return 1 on success, 0 on failure. */
267
268static int
269check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 void unget_char(int, struct tok_state *),
271 int set_readline(struct tok_state *, const char *),
272 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 int ch1, ch2, ch3;
275 ch1 = get_char(tok);
276 tok->decoding_state = STATE_RAW;
277 if (ch1 == EOF) {
278 return 1;
279 } else if (ch1 == 0xEF) {
280 ch2 = get_char(tok);
281 if (ch2 != 0xBB) {
282 unget_char(ch2, tok);
283 unget_char(ch1, tok);
284 return 1;
285 }
286 ch3 = get_char(tok);
287 if (ch3 != 0xBF) {
288 unget_char(ch3, tok);
289 unget_char(ch2, tok);
290 unget_char(ch1, tok);
291 return 1;
292 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000294 /* Disable support for UTF-16 BOMs until a decision
295 is made whether this needs to be supported. */
296 } else if (ch1 == 0xFE) {
297 ch2 = get_char(tok);
298 if (ch2 != 0xFF) {
299 unget_char(ch2, tok);
300 unget_char(ch1, tok);
301 return 1;
302 }
303 if (!set_readline(tok, "utf-16-be"))
304 return 0;
305 tok->decoding_state = STATE_NORMAL;
306 } else if (ch1 == 0xFF) {
307 ch2 = get_char(tok);
308 if (ch2 != 0xFE) {
309 unget_char(ch2, tok);
310 unget_char(ch1, tok);
311 return 1;
312 }
313 if (!set_readline(tok, "utf-16-le"))
314 return 0;
315 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000316#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 } else {
318 unget_char(ch1, tok);
319 return 1;
320 }
321 if (tok->encoding != NULL)
322 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700323 tok->encoding = new_string("utf-8", 5, tok);
324 if (!tok->encoding)
325 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 /* No need to set_readline: input is already utf-8 */
327 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328}
329
330/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000331 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000333 On entry, tok->decoding_buffer will be one of:
334 1) NULL: need to call tok->decoding_readline to get a new line
335 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000336 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000337 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 (in the s buffer) to copy entire contents of the line read
339 by tok->decoding_readline. tok->decoding_buffer has the overflow.
340 In this case, fp_readl is called in a loop (with an expanded buffer)
341 until the buffer ends with a '\n' (or until the end of the file is
342 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344
345static char *
346fp_readl(char *s, int size, struct tok_state *tok)
347{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 PyObject* bufobj;
349 const char *buf;
350 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 /* Ask for one less byte so we can terminate it */
353 assert(size > 0);
354 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 if (tok->decoding_buffer) {
357 bufobj = tok->decoding_buffer;
358 Py_INCREF(bufobj);
359 }
360 else
361 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100362 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000363 if (bufobj == NULL)
364 goto error;
365 }
366 if (PyUnicode_CheckExact(bufobj))
367 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200368 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000369 if (buf == NULL) {
370 goto error;
371 }
372 }
373 else
374 {
375 buf = PyByteArray_AsString(bufobj);
376 if (buf == NULL) {
377 goto error;
378 }
379 buflen = PyByteArray_GET_SIZE(bufobj);
380 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 Py_XDECREF(tok->decoding_buffer);
383 if (buflen > size) {
384 /* Too many chars, the rest goes into tok->decoding_buffer */
385 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
388 goto error;
389 buflen = size;
390 }
391 else
392 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 memcpy(s, buf, buflen);
395 s[buflen] = '\0';
396 if (buflen == 0) /* EOF */
397 s = NULL;
398 Py_DECREF(bufobj);
399 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000400
401error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000402 Py_XDECREF(bufobj);
403 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000404}
405
406/* Set the readline function for TOK to a StreamReader's
407 readline function. The StreamReader is named ENC.
408
409 This function is called from check_bom and check_coding_spec.
410
411 ENC is usually identical to the future value of tok->encoding,
412 except for the (currently unsupported) case of UTF-16.
413
414 Return 1 on success, 0 on failure. */
415
416static int
417fp_setreadl(struct tok_state *tok, const char* enc)
418{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700419 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200420 _Py_IDENTIFIER(open);
421 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000422 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200423 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
Victor Stinner22a351a2010-10-14 12:04:34 +0000425 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200426 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100427 * position of tok->fp. If tok->fp was opened in text mode on Windows,
428 * its file position counts CRLF as one char and can't be directly mapped
429 * to the file offset for fd. Instead we step back one byte and read to
430 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200431 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100432 if (pos == -1 ||
433 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000434 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700435 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000436 }
437
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700438 io = PyImport_ImportModuleNoBlock("io");
439 if (io == NULL)
440 return 0;
441
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200442 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000443 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000445 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700446 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000447
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200448 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449 Py_DECREF(stream);
450 if (readline == NULL)
451 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300452 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700453
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100454 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100455 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 if (bufobj == NULL)
457 return 0;
458 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100459 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000460
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462}
463
464/* Fetch the next byte from TOK. */
465
466static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468}
469
470/* Unfetch the last byte back into TOK. */
471
472static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the
   NUL and no byte beyond it is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
503
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504/* Read a line of input from TOK. Determine encoding
505 if necessary. */
506
507static char *
508decoding_fgets(char *s, int size, struct tok_state *tok)
509{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 char *line = NULL;
511 int badchar = 0;
512 for (;;) {
513 if (tok->decoding_state == STATE_NORMAL) {
514 /* We already have a codec associated with
515 this input. */
516 line = fp_readl(s, size, tok);
517 break;
518 } else if (tok->decoding_state == STATE_RAW) {
519 /* We want a 'raw' read. */
520 line = Py_UniversalNewlineFgets(s, size,
521 tok->fp, NULL);
522 break;
523 } else {
524 /* We have not yet determined the encoding.
525 If an encoding is found, use the file-pointer
526 reader functions from now on. */
527 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
528 return error_ret(tok);
529 assert(tok->decoding_state != STATE_INIT);
530 }
531 }
532 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
533 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
534 return error_ret(tok);
535 }
536 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 /* The default encoding is UTF-8, so make sure we don't have any
538 non-UTF-8 sequences in it. */
539 if (line && !tok->encoding) {
540 unsigned char *c;
541 int length;
542 for (c = (unsigned char *)line; *c; c += length)
543 if (!(length = valid_utf8(c))) {
544 badchar = *c;
545 break;
546 }
547 }
548 if (badchar) {
549 /* Need to add 1 to the line number, since this line
550 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200551 PyErr_Format(PyExc_SyntaxError,
552 "Non-UTF-8 code starting with '\\x%.2x' "
553 "in file %U on line %i, "
554 "but no encoding declared; "
555 "see http://python.org/dev/peps/pep-0263/ for details",
556 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 return error_ret(tok);
558 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562static int
563decoding_feof(struct tok_state *tok)
564{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000565 if (tok->decoding_state != STATE_NORMAL) {
566 return feof(tok->fp);
567 } else {
568 PyObject* buf = tok->decoding_buffer;
569 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100570 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 if (buf == NULL) {
572 error_ret(tok);
573 return 1;
574 } else {
575 tok->decoding_buffer = buf;
576 }
577 }
578 return PyObject_Length(buf) == 0;
579 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580}
581
582/* Fetch a byte from TOK, using the string buffer. */
583
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000584static int
585buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587}
588
589/* Unfetch a byte from TOK, using the string buffer. */
590
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000591static void
592buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000593 tok->str--;
594 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595}
596
597/* Set the readline function for TOK to ENC. For the string-based
598 tokenizer, this means to just record the encoding. */
599
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000600static int
601buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 tok->enc = enc;
603 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604}
605
606/* Return a UTF-8 encoding Python string object from the
607 C byte string STR, which is encoded with ENC. */
608
609static PyObject *
610translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000611 PyObject *utf8;
612 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
613 if (buf == NULL)
614 return NULL;
615 utf8 = PyUnicode_AsUTF8String(buf);
616 Py_DECREF(buf);
617 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000620
621static char *
622translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200623 int skip_next_lf = 0;
624 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 char *buf, *current;
626 char c = '\0';
627 buf = PyMem_MALLOC(needed_length);
628 if (buf == NULL) {
629 tok->done = E_NOMEM;
630 return NULL;
631 }
632 for (current = buf; *s; s++, current++) {
633 c = *s;
634 if (skip_next_lf) {
635 skip_next_lf = 0;
636 if (c == '\n') {
637 c = *++s;
638 if (!c)
639 break;
640 }
641 }
642 if (c == '\r') {
643 skip_next_lf = 1;
644 c = '\n';
645 }
646 *current = c;
647 }
648 /* If this is exec input, add a newline to the end of the string if
649 there isn't one already. */
650 if (exec_input && c != '\n') {
651 *current = '\n';
652 current++;
653 }
654 *current = '\0';
655 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000656 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 /* should never fail */
Pablo Galindocb90c892019-03-19 17:17:58 +0000658 char* result = PyMem_REALLOC(buf, final_length);
659 if (result == NULL) {
660 PyMem_FREE(buf);
661 }
662 buf = result;
663 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000665}
666
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667/* Decode a byte string STR for use as the buffer of TOK.
668 Look for encoding declarations inside STR, and record them
669 inside TOK. */
670
Andy Lester384f3c52020-02-27 20:44:52 -0600671static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000672decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000673{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600675 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 const char *s;
677 const char *newl[2] = {NULL, NULL};
678 int lineno = 0;
679 tok->input = str = translate_newlines(input, single, tok);
680 if (str == NULL)
681 return NULL;
682 tok->enc = NULL;
683 tok->str = str;
684 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
685 return error_ret(tok);
686 str = tok->str; /* string after BOM if any */
687 assert(str);
688 if (tok->enc != NULL) {
689 utf8 = translate_into_utf8(str, tok->enc);
690 if (utf8 == NULL)
691 return error_ret(tok);
692 str = PyBytes_AsString(utf8);
693 }
694 for (s = str;; s++) {
695 if (*s == '\0') break;
696 else if (*s == '\n') {
697 assert(lineno < 2);
698 newl[lineno] = s;
699 lineno++;
700 if (lineno == 2) break;
701 }
702 }
703 tok->enc = NULL;
704 /* need to check line 1 and 2 separately since check_coding_spec
705 assumes a single line as input */
706 if (newl[0]) {
707 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200709 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711 tok, buf_setreadl))
712 return error_ret(tok);
713 }
714 }
715 if (tok->enc != NULL) {
716 assert(utf8 == NULL);
717 utf8 = translate_into_utf8(str, tok->enc);
718 if (utf8 == NULL)
719 return error_ret(tok);
720 str = PyBytes_AS_STRING(utf8);
721 }
722 assert(tok->decoding_buffer == NULL);
723 tok->decoding_buffer = utf8; /* CAUTION */
724 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000725}
726
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727/* Set up tokenizer for string */
728
729struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000730PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600733 char *decoded;
734
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 if (tok == NULL)
736 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600737 decoded = decode_str(str, exec_input, tok);
738 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyTokenizer_Free(tok);
740 return NULL;
741 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000742
Andy Lester384f3c52020-02-27 20:44:52 -0600743 tok->buf = tok->cur = tok->inp = decoded;
744 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746}
747
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000749PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000750{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600752 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 if (tok == NULL)
754 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600755 tok->input = translated = translate_newlines(str, exec_input, tok);
756 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 PyTokenizer_Free(tok);
758 return NULL;
759 }
760 tok->decoding_state = STATE_RAW;
761 tok->read_coding_spec = 1;
762 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600763 tok->str = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 tok->encoding = (char *)PyMem_MALLOC(6);
765 if (!tok->encoding) {
766 PyTokenizer_Free(tok);
767 return NULL;
768 }
769 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000770
Andy Lester384f3c52020-02-27 20:44:52 -0600771 tok->buf = tok->cur = tok->inp = translated;
772 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000774}
775
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000776/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777
778struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300779PyTokenizer_FromFile(FILE *fp, const char* enc,
780 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 struct tok_state *tok = tok_new();
783 if (tok == NULL)
784 return NULL;
785 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
786 PyTokenizer_Free(tok);
787 return NULL;
788 }
789 tok->cur = tok->inp = tok->buf;
790 tok->end = tok->buf + BUFSIZ;
791 tok->fp = fp;
792 tok->prompt = ps1;
793 tok->nextprompt = ps2;
794 if (enc != NULL) {
795 /* Must copy encoding declaration since it
796 gets copied into the parse tree. */
797 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
798 if (!tok->encoding) {
799 PyTokenizer_Free(tok);
800 return NULL;
801 }
802 strcpy(tok->encoding, enc);
803 tok->decoding_state = STATE_NORMAL;
804 }
805 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806}
807
808
809/* Free a tok_state structure */
810
811void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000812PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 if (tok->encoding != NULL)
815 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 Py_XDECREF(tok->decoding_readline);
817 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200818 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 if (tok->fp != NULL && tok->buf != NULL)
820 PyMem_FREE(tok->buf);
821 if (tok->input)
Andy Lester384f3c52020-02-27 20:44:52 -0600822 PyMem_FREE(tok->input);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824}
825
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826/* Get next char, updating state; error code goes into tok->done */
827
828static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200829tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000830{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 for (;;) {
832 if (tok->cur != tok->inp) {
833 return Py_CHARMASK(*tok->cur++); /* Fast path */
834 }
835 if (tok->done != E_OK)
836 return EOF;
837 if (tok->fp == NULL) {
838 char *end = strchr(tok->inp, '\n');
839 if (end != NULL)
840 end++;
841 else {
842 end = strchr(tok->inp, '\0');
843 if (end == tok->inp) {
844 tok->done = E_EOF;
845 return EOF;
846 }
847 }
848 if (tok->start == NULL)
849 tok->buf = tok->cur;
850 tok->line_start = tok->cur;
851 tok->lineno++;
852 tok->inp = end;
853 return Py_CHARMASK(*tok->cur++);
854 }
855 if (tok->prompt != NULL) {
856 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000857 if (newtok != NULL) {
858 char *translated = translate_newlines(newtok, 0, tok);
859 PyMem_FREE(newtok);
860 if (translated == NULL)
861 return EOF;
862 newtok = translated;
863 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 if (tok->encoding && newtok && *newtok) {
865 /* Recode to UTF-8 */
866 Py_ssize_t buflen;
867 const char* buf;
868 PyObject *u = translate_into_utf8(newtok, tok->encoding);
869 PyMem_FREE(newtok);
870 if (!u) {
871 tok->done = E_DECODE;
872 return EOF;
873 }
874 buflen = PyBytes_GET_SIZE(u);
875 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700877 if (newtok == NULL) {
878 Py_DECREF(u);
879 tok->done = E_NOMEM;
880 return EOF;
881 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 strcpy(newtok, buf);
883 Py_DECREF(u);
884 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 if (tok->nextprompt != NULL)
886 tok->prompt = tok->nextprompt;
887 if (newtok == NULL)
888 tok->done = E_INTR;
889 else if (*newtok == '\0') {
890 PyMem_FREE(newtok);
891 tok->done = E_EOF;
892 }
893 else if (tok->start != NULL) {
894 size_t start = tok->start - tok->buf;
895 size_t oldlen = tok->cur - tok->buf;
896 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000897 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 char *buf = tok->buf;
899 buf = (char *)PyMem_REALLOC(buf, newlen+1);
900 tok->lineno++;
901 if (buf == NULL) {
902 PyMem_FREE(tok->buf);
903 tok->buf = NULL;
904 PyMem_FREE(newtok);
905 tok->done = E_NOMEM;
906 return EOF;
907 }
908 tok->buf = buf;
909 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000910 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 tok->line_start = tok->cur;
912 strcpy(tok->buf + oldlen, newtok);
913 PyMem_FREE(newtok);
914 tok->inp = tok->buf + newlen;
915 tok->end = tok->inp + 1;
916 tok->start = tok->buf + start;
917 }
918 else {
919 tok->lineno++;
920 if (tok->buf != NULL)
921 PyMem_FREE(tok->buf);
922 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 tok->cur = tok->buf;
924 tok->line_start = tok->buf;
925 tok->inp = strchr(tok->buf, '\0');
926 tok->end = tok->inp + 1;
927 }
928 }
929 else {
930 int done = 0;
931 Py_ssize_t cur = 0;
932 char *pt;
933 if (tok->start == NULL) {
934 if (tok->buf == NULL) {
935 tok->buf = (char *)
936 PyMem_MALLOC(BUFSIZ);
937 if (tok->buf == NULL) {
938 tok->done = E_NOMEM;
939 return EOF;
940 }
941 tok->end = tok->buf + BUFSIZ;
942 }
943 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
944 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200945 if (!tok->decoding_erred)
946 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000947 done = 1;
948 }
949 else {
950 tok->done = E_OK;
951 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700952 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 }
954 }
955 else {
956 cur = tok->cur - tok->buf;
957 if (decoding_feof(tok)) {
958 tok->done = E_EOF;
959 done = 1;
960 }
961 else
962 tok->done = E_OK;
963 }
964 tok->lineno++;
965 /* Read until '\n' or EOF */
966 while (!done) {
967 Py_ssize_t curstart = tok->start == NULL ? -1 :
968 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700969 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 Py_ssize_t curvalid = tok->inp - tok->buf;
971 Py_ssize_t newsize = curvalid + BUFSIZ;
972 char *newbuf = tok->buf;
973 newbuf = (char *)PyMem_REALLOC(newbuf,
974 newsize);
975 if (newbuf == NULL) {
976 tok->done = E_NOMEM;
977 tok->cur = tok->inp;
978 return EOF;
979 }
980 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200981 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700982 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200983 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000984 tok->inp = tok->buf + curvalid;
985 tok->end = tok->buf + newsize;
986 tok->start = curstart < 0 ? NULL :
987 tok->buf + curstart;
988 if (decoding_fgets(tok->inp,
989 (int)(tok->end - tok->inp),
990 tok) == NULL) {
991 /* Break out early on decoding
992 errors, as tok->buf will be NULL
993 */
994 if (tok->decoding_erred)
995 return EOF;
996 /* Last line does not end in \n,
997 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -0700998 if (tok->inp[-1] != '\n')
999 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 }
1001 tok->inp = strchr(tok->inp, '\0');
1002 done = tok->inp[-1] == '\n';
1003 }
1004 if (tok->buf != NULL) {
1005 tok->cur = tok->buf + cur;
1006 tok->line_start = tok->cur;
1007 /* replace "\r\n" with "\n" */
1008 /* For Mac leave the \r, giving a syntax error */
1009 pt = tok->inp - 2;
1010 if (pt >= tok->buf && *pt == '\r') {
1011 *pt++ = '\n';
1012 *pt = '\0';
1013 tok->inp = pt;
1014 }
1015 }
1016 }
1017 if (tok->done != E_OK) {
1018 if (tok->prompt != NULL)
1019 PySys_WriteStderr("\n");
1020 tok->cur = tok->inp;
1021 return EOF;
1022 }
1023 }
1024 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025}
1026
1027
1028/* Back-up one character */
1029
1030static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001031tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 if (c != EOF) {
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001034 if (--tok->cur < tok->buf) {
Victor Stinner87d3b9d2020-03-25 19:27:36 +01001035 Py_FatalError("tokenizer beginning of buffer");
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001036 }
1037 if (*tok->cur != c) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 *tok->cur = c;
Victor Stinner9e5d30c2020-03-07 00:54:20 +01001039 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041}
1042
1043
Guido van Rossum926f13a1998-04-09 21:38:06 +00001044static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001045syntaxerror(struct tok_state *tok, const char *format, ...)
1046{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001047 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001048 va_list vargs;
1049#ifdef HAVE_STDARG_PROTOTYPES
1050 va_start(vargs, format);
1051#else
1052 va_start(vargs);
1053#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001054 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001055 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001056 if (!errmsg) {
1057 goto error;
1058 }
1059
1060 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1061 "replace");
1062 if (!errtext) {
1063 goto error;
1064 }
1065 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1066 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1067 if (line_len != tok->cur - tok->line_start) {
1068 Py_DECREF(errtext);
1069 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1070 "replace");
1071 }
1072 if (!errtext) {
1073 goto error;
1074 }
1075
1076 args = Py_BuildValue("(O(OiiN))", errmsg,
1077 tok->filename, tok->lineno, offset, errtext);
1078 if (args) {
1079 PyErr_SetObject(PyExc_SyntaxError, args);
1080 Py_DECREF(args);
1081 }
1082
1083error:
1084 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001085 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001086 return ERRORTOKEN;
1087}
1088
1089static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001090indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001091{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001092 tok->done = E_TABSPACE;
1093 tok->cur = tok->inp;
1094 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095}
1096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097/* Verify that the identifier follows PEP 3131.
1098 All identifier strings are guaranteed to be "ready" unicode objects.
1099 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001100static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001101verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001102{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 PyObject *s;
1104 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001105 if (tok->decoding_erred)
1106 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001108 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1110 PyErr_Clear();
1111 tok->done = E_IDENTIFIER;
1112 } else {
1113 tok->done = E_ERROR;
1114 }
1115 return 0;
1116 }
1117 result = PyUnicode_IsIdentifier(s);
1118 Py_DECREF(s);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001119 if (result == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 tok->done = E_IDENTIFIER;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001121 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001123}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001124
/* Consume the rest of a run of decimal digits, in which single underscores
   may separate digit groups.

   Returns the first character that is not part of the run; that character
   has been read and is NOT pushed back.  If an underscore is not followed
   by a digit, the offending character is pushed back, a syntax error is
   raised and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Skip over one group of digits. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* An underscore must be followed by a digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1146
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001147/* Get next token, after space stripping etc. */
1148
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001149static int
Andy Lester384f3c52020-02-27 20:44:52 -06001150tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001151{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001152 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001156 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 tok->start = NULL;
1158 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001159
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 /* Get indentation level */
1161 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001162 int col = 0;
1163 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 tok->atbol = 0;
1165 for (;;) {
1166 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001167 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001169 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001171 col = (col / tok->tabsize + 1) * tok->tabsize;
1172 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 }
Brett Cannona721aba2016-09-09 14:57:09 -07001174 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001176 }
1177 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001179 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 }
1181 tok_backup(tok, c);
1182 if (c == '#' || c == '\n') {
1183 /* Lines with only whitespace and/or comments
1184 shouldn't affect the indentation and are
1185 not passed to the parser as NEWLINE tokens,
1186 except *totally* empty lines in interactive
1187 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001188 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001190 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001191 else if (tok->prompt != NULL && tok->lineno == 1) {
1192 /* In interactive mode, if the first line contains
1193 only spaces and/or a comment, let it through. */
1194 blankline = 0;
1195 col = altcol = 0;
1196 }
Brett Cannona721aba2016-09-09 14:57:09 -07001197 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001199 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 /* We can't jump back right here since we still
1201 may need to skip to the end of a comment */
1202 }
1203 if (!blankline && tok->level == 0) {
1204 if (col == tok->indstack[tok->indent]) {
1205 /* No change */
1206 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001207 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 }
1209 }
1210 else if (col > tok->indstack[tok->indent]) {
1211 /* Indent -- always one */
1212 if (tok->indent+1 >= MAXINDENT) {
1213 tok->done = E_TOODEEP;
1214 tok->cur = tok->inp;
1215 return ERRORTOKEN;
1216 }
1217 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001218 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 }
1220 tok->pendin++;
1221 tok->indstack[++tok->indent] = col;
1222 tok->altindstack[tok->indent] = altcol;
1223 }
1224 else /* col < tok->indstack[tok->indent] */ {
1225 /* Dedent -- any number, must be consistent */
1226 while (tok->indent > 0 &&
1227 col < tok->indstack[tok->indent]) {
1228 tok->pendin--;
1229 tok->indent--;
1230 }
1231 if (col != tok->indstack[tok->indent]) {
1232 tok->done = E_DEDENT;
1233 tok->cur = tok->inp;
1234 return ERRORTOKEN;
1235 }
1236 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001237 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 }
1239 }
1240 }
1241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001242
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 /* Return pending indents/dedents */
1246 if (tok->pendin != 0) {
1247 if (tok->pendin < 0) {
1248 tok->pendin++;
1249 return DEDENT;
1250 }
1251 else {
1252 tok->pendin--;
1253 return INDENT;
1254 }
1255 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001256
Guido van Rossum495da292019-03-07 12:38:08 -08001257 /* Peek ahead at the next character */
1258 c = tok_nextc(tok);
1259 tok_backup(tok, c);
1260 /* Check if we are closing an async function */
1261 if (tok->async_def
1262 && !blankline
1263 /* Due to some implementation artifacts of type comments,
1264 * a TYPE_COMMENT at the start of a function won't set an
1265 * indentation level and it will produce a NEWLINE after it.
1266 * To avoid spuriously ending an async function due to this,
1267 * wait until we have some non-newline char in front of us. */
1268 && c != '\n'
1269 && tok->level == 0
1270 /* There was a NEWLINE after ASYNC DEF,
1271 so we're past the signature. */
1272 && tok->async_def_nl
1273 /* Current indentation level is less than where
1274 the async function was defined */
1275 && tok->async_def_indent >= tok->indent)
1276 {
1277 tok->async_def = 0;
1278 tok->async_def_indent = 0;
1279 tok->async_def_nl = 0;
1280 }
1281
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 tok->start = NULL;
1284 /* Skip spaces */
1285 do {
1286 c = tok_nextc(tok);
1287 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001288
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 /* Set start of current token */
1290 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001291
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001292 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001293 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001294 const char *prefix, *p, *type_start;
1295
Brett Cannona721aba2016-09-09 14:57:09 -07001296 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001298 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001299
1300 if (tok->type_comments) {
1301 p = tok->start;
1302 prefix = type_comment_prefix;
1303 while (*prefix && p < tok->cur) {
1304 if (*prefix == ' ') {
1305 while (*p == ' ' || *p == '\t') {
1306 p++;
1307 }
1308 } else if (*prefix == *p) {
1309 p++;
1310 } else {
1311 break;
1312 }
1313
1314 prefix++;
1315 }
1316
1317 /* This is a type comment if we matched all of type_comment_prefix. */
1318 if (!*prefix) {
1319 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001320 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001321 tok_backup(tok, c); /* don't eat the newline or EOF */
1322
1323 type_start = p;
1324
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001325 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001326 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001327 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001328 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001329 && !(tok->cur > ignore_end
1330 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001331
1332 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001333 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001334 *p_end = tok->cur;
1335
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001336 /* If this type ignore is the only thing on the line, consume the newline also. */
1337 if (blankline) {
1338 tok_nextc(tok);
1339 tok->atbol = 1;
1340 }
1341 return TYPE_IGNORE;
1342 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001343 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001344 *p_end = tok->cur;
1345 return TYPE_COMMENT;
1346 }
1347 }
1348 }
Brett Cannona721aba2016-09-09 14:57:09 -07001349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001350
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 /* Check for EOF and errors now */
1352 if (c == EOF) {
1353 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1354 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 /* Identifier (most frequent token!) */
1357 nonascii = 0;
1358 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001359 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001360 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001361 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001362 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001363 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001364 /* Since this is a backwards compatibility support literal we don't
1365 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001366 else if (!(saw_b || saw_u || saw_r || saw_f)
1367 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001368 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001369 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001370 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001371 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001372 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001373 }
1374 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001375 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001376 }
1377 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001378 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001379 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001381 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001383 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 }
1385 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001386 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001388 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 c = tok_nextc(tok);
1390 }
1391 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001392 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001394 }
Pablo Galindo11a7f152020-04-21 01:53:04 +01001395
1396 *p_start = tok->start;
1397 *p_end = tok->cur;
1398
Guido van Rossum495da292019-03-07 12:38:08 -08001399 /* async/await parsing block. */
1400 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1401 /* May be an 'async' or 'await' token. For Python 3.7 or
1402 later we recognize them unconditionally. For Python
1403 3.5 or 3.6 we recognize 'async' in front of 'def', and
1404 either one inside of 'async def'. (Technically we
1405 shouldn't recognize these at all for 3.4 or earlier,
1406 but there's no *valid* Python 3.4 code that would be
1407 rejected, and async functions will be rejected in a
1408 later phase.) */
1409 if (!tok->async_hacks || tok->async_def) {
1410 /* Always recognize the keywords. */
1411 if (memcmp(tok->start, "async", 5) == 0) {
1412 return ASYNC;
1413 }
1414 if (memcmp(tok->start, "await", 5) == 0) {
1415 return AWAIT;
1416 }
1417 }
1418 else if (memcmp(tok->start, "async", 5) == 0) {
1419 /* The current token is 'async'.
1420 Look ahead one token to see if that is 'def'. */
1421
1422 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001423 const char *ahead_tok_start = NULL;
1424 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001425 int ahead_tok_kind;
1426
1427 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1428 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1429 &ahead_tok_end);
1430
1431 if (ahead_tok_kind == NAME
1432 && ahead_tok.cur - ahead_tok.start == 3
1433 && memcmp(ahead_tok.start, "def", 3) == 0)
1434 {
1435 /* The next token is going to be 'def', so instead of
1436 returning a plain NAME token, return ASYNC. */
1437 tok->async_def_indent = tok->indent;
1438 tok->async_def = 1;
1439 return ASYNC;
1440 }
1441 }
1442 }
1443
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 return NAME;
1445 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001446
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 /* Newline */
1448 if (c == '\n') {
1449 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001450 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001452 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453 *p_start = tok->start;
1454 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1455 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001456 if (tok->async_def) {
1457 /* We're somewhere inside an 'async def' function, and
1458 we've encountered a NEWLINE after its signature. */
1459 tok->async_def_nl = 1;
1460 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 return NEWLINE;
1462 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 /* Period or number starting with period? */
1465 if (c == '.') {
1466 c = tok_nextc(tok);
1467 if (isdigit(c)) {
1468 goto fraction;
1469 } else if (c == '.') {
1470 c = tok_nextc(tok);
1471 if (c == '.') {
1472 *p_start = tok->start;
1473 *p_end = tok->cur;
1474 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001475 }
1476 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 tok_backup(tok, c);
1478 }
1479 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001480 }
1481 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 tok_backup(tok, c);
1483 }
1484 *p_start = tok->start;
1485 *p_end = tok->cur;
1486 return DOT;
1487 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001488
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001489 /* Number */
1490 if (isdigit(c)) {
1491 if (c == '0') {
1492 /* Hex, octal or binary -- maybe. */
1493 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001494 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 /* Hex */
1496 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001498 if (c == '_') {
1499 c = tok_nextc(tok);
1500 }
1501 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001502 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001503 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001504 }
1505 do {
1506 c = tok_nextc(tok);
1507 } while (isxdigit(c));
1508 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001509 }
1510 else if (c == 'o' || c == 'O') {
1511 /* Octal */
1512 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001514 if (c == '_') {
1515 c = tok_nextc(tok);
1516 }
1517 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001518 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001519 if (isdigit(c)) {
1520 return syntaxerror(tok,
1521 "invalid digit '%c' in octal literal", c);
1522 }
1523 else {
1524 return syntaxerror(tok, "invalid octal literal");
1525 }
Brett Cannona721aba2016-09-09 14:57:09 -07001526 }
1527 do {
1528 c = tok_nextc(tok);
1529 } while ('0' <= c && c < '8');
1530 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001531 if (isdigit(c)) {
1532 return syntaxerror(tok,
1533 "invalid digit '%c' in octal literal", c);
1534 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001535 }
1536 else if (c == 'b' || c == 'B') {
1537 /* Binary */
1538 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001540 if (c == '_') {
1541 c = tok_nextc(tok);
1542 }
1543 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001544 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001545 if (isdigit(c)) {
1546 return syntaxerror(tok,
1547 "invalid digit '%c' in binary literal", c);
1548 }
1549 else {
1550 return syntaxerror(tok, "invalid binary literal");
1551 }
Brett Cannona721aba2016-09-09 14:57:09 -07001552 }
1553 do {
1554 c = tok_nextc(tok);
1555 } while (c == '0' || c == '1');
1556 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001557 if (isdigit(c)) {
1558 return syntaxerror(tok,
1559 "invalid digit '%c' in binary literal", c);
1560 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 }
1562 else {
1563 int nonzero = 0;
1564 /* maybe old-style octal; c is first char of it */
1565 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001566 while (1) {
1567 if (c == '_') {
1568 c = tok_nextc(tok);
1569 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001570 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001571 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001572 }
1573 }
1574 if (c != '0') {
1575 break;
1576 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001577 c = tok_nextc(tok);
1578 }
Brett Cannona721aba2016-09-09 14:57:09 -07001579 if (isdigit(c)) {
1580 nonzero = 1;
1581 c = tok_decimal_tail(tok);
1582 if (c == 0) {
1583 return ERRORTOKEN;
1584 }
1585 }
1586 if (c == '.') {
1587 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001588 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001589 }
1590 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001592 }
1593 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001595 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001597 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001599 return syntaxerror(tok,
1600 "leading zeros in decimal integer "
1601 "literals are not permitted; "
1602 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 }
1604 }
1605 }
1606 else {
1607 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001608 c = tok_decimal_tail(tok);
1609 if (c == 0) {
1610 return ERRORTOKEN;
1611 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 {
1613 /* Accept floating point numbers. */
1614 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001615 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001616 fraction:
1617 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001618 if (isdigit(c)) {
1619 c = tok_decimal_tail(tok);
1620 if (c == 0) {
1621 return ERRORTOKEN;
1622 }
1623 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 }
1625 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001626 int e;
1627 exponent:
1628 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 /* Exponent part */
1630 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001631 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001632 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001633 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001634 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001635 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001636 }
1637 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001639 tok_backup(tok, e);
1640 *p_start = tok->start;
1641 *p_end = tok->cur;
1642 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 }
Brett Cannona721aba2016-09-09 14:57:09 -07001644 c = tok_decimal_tail(tok);
1645 if (c == 0) {
1646 return ERRORTOKEN;
1647 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 }
Brett Cannona721aba2016-09-09 14:57:09 -07001649 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 /* Imaginary part */
1651 imaginary:
1652 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001653 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 }
1655 }
1656 tok_backup(tok, c);
1657 *p_start = tok->start;
1658 *p_end = tok->cur;
1659 return NUMBER;
1660 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001661
1662 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 /* String */
1664 if (c == '\'' || c == '"') {
1665 int quote = c;
1666 int quote_size = 1; /* 1 or 3 */
1667 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001668
Anthony Sottile995d9b92019-01-12 20:05:13 -08001669 /* Nodes of type STRING, especially multi line strings
1670 must be handled differently in order to get both
1671 the starting line number and the column offset right.
1672 (cf. issue 16806) */
1673 tok->first_lineno = tok->lineno;
1674 tok->multi_line_start = tok->line_start;
1675
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676 /* Find the quote size and start of string */
1677 c = tok_nextc(tok);
1678 if (c == quote) {
1679 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001680 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001682 }
1683 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001685 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 }
Brett Cannona721aba2016-09-09 14:57:09 -07001687 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001689 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001690
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691 /* Get rest of string */
1692 while (end_quote_size != quote_size) {
1693 c = tok_nextc(tok);
1694 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001695 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001697 }
1698 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001700 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 tok->cur = tok->inp;
1702 return ERRORTOKEN;
1703 }
1704 if (quote_size == 1 && c == '\n') {
1705 tok->done = E_EOLS;
1706 tok->cur = tok->inp;
1707 return ERRORTOKEN;
1708 }
Brett Cannona721aba2016-09-09 14:57:09 -07001709 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001711 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001712 else {
1713 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001714 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001715 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001716 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 }
1718 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001719
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 *p_start = tok->start;
1721 *p_end = tok->cur;
1722 return STRING;
1723 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001724
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001725 /* Line continuation */
1726 if (c == '\\') {
1727 c = tok_nextc(tok);
1728 if (c != '\n') {
1729 tok->done = E_LINECONT;
1730 tok->cur = tok->inp;
1731 return ERRORTOKEN;
1732 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001733 c = tok_nextc(tok);
1734 if (c == EOF) {
1735 tok->done = E_EOF;
1736 tok->cur = tok->inp;
1737 return ERRORTOKEN;
1738 } else {
1739 tok_backup(tok, c);
1740 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001741 tok->cont_line = 1;
1742 goto again; /* Read next line */
1743 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001744
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 /* Check for two-character token */
1746 {
1747 int c2 = tok_nextc(tok);
1748 int token = PyToken_TwoChars(c, c2);
1749 if (token != OP) {
1750 int c3 = tok_nextc(tok);
1751 int token3 = PyToken_ThreeChars(c, c2, c3);
1752 if (token3 != OP) {
1753 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001754 }
1755 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 tok_backup(tok, c3);
1757 }
1758 *p_start = tok->start;
1759 *p_end = tok->cur;
1760 return token;
1761 }
1762 tok_backup(tok, c2);
1763 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001764
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 /* Keep track of parentheses nesting level */
1766 switch (c) {
1767 case '(':
1768 case '[':
1769 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001770 if (tok->level >= MAXLEVEL) {
1771 return syntaxerror(tok, "too many nested parentheses");
1772 }
1773 tok->parenstack[tok->level] = c;
1774 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001775 tok->level++;
1776 break;
1777 case ')':
1778 case ']':
1779 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001780 if (!tok->level) {
1781 return syntaxerror(tok, "unmatched '%c'", c);
1782 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001784 int opening = tok->parenstack[tok->level];
1785 if (!((opening == '(' && c == ')') ||
1786 (opening == '[' && c == ']') ||
1787 (opening == '{' && c == '}')))
1788 {
1789 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1790 return syntaxerror(tok,
1791 "closing parenthesis '%c' does not match "
1792 "opening parenthesis '%c' on line %d",
1793 c, opening, tok->parenlinenostack[tok->level]);
1794 }
1795 else {
1796 return syntaxerror(tok,
1797 "closing parenthesis '%c' does not match "
1798 "opening parenthesis '%c'",
1799 c, opening);
1800 }
1801 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 break;
1803 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001804
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001805 /* Punctuation character */
1806 *p_start = tok->start;
1807 *p_end = tok->cur;
1808 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001809}
1810
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001811int
Andy Lester384f3c52020-02-27 20:44:52 -06001812PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001814 int result = tok_get(tok, p_start, p_end);
1815 if (tok->decoding_erred) {
1816 result = ERRORTOKEN;
1817 tok->done = E_DECODE;
1818 }
1819 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001820}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001821
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001822/* Get the encoding of a Python file. Check for the coding cookie and check if
1823 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001824
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001825 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1826 encoding in the first or second line of the file (in which case the encoding
1827 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001828
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001829 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1830 by the caller. */
1831
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001832char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001833PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001834{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001835 struct tok_state *tok;
1836 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001837 const char *p_start = NULL;
1838 const char *p_end = NULL;
1839 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001840
Victor Stinnerdaf45552013-08-28 00:53:59 +02001841 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001842 if (fd < 0) {
1843 return NULL;
1844 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001845
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001846 fp = fdopen(fd, "r");
1847 if (fp == NULL) {
1848 return NULL;
1849 }
1850 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1851 if (tok == NULL) {
1852 fclose(fp);
1853 return NULL;
1854 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001855 if (filename != NULL) {
1856 Py_INCREF(filename);
1857 tok->filename = filename;
1858 }
1859 else {
1860 tok->filename = PyUnicode_FromString("<string>");
1861 if (tok->filename == NULL) {
1862 fclose(fp);
1863 PyTokenizer_Free(tok);
1864 return encoding;
1865 }
1866 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001867 while (tok->lineno < 2 && tok->done == E_OK) {
1868 PyTokenizer_Get(tok, &p_start, &p_end);
1869 }
1870 fclose(fp);
1871 if (tok->encoding) {
1872 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1873 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301874 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001875 }
1876 PyTokenizer_Free(tok);
1877 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001878}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001879
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001880char *
1881PyTokenizer_FindEncoding(int fd)
1882{
1883 return PyTokenizer_FindEncodingFilename(fd, NULL);
1884}
1885
#ifdef Py_DEBUG

/* Debug-build helper: print the name of token `type` (looked up in
   _PyParser_TokenNames) to stdout.  For NAME, NUMBER, STRING and OP tokens
   the text between start and end is printed as well, in parentheses.
   No trailing newline is written. */
void
tok_dump(int type, char *start, char *end)
{
    const char *token_name = _PyParser_TokenNames[type];

    printf("%s", token_name);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        /* The token text is delimited by [start, end), not NUL-terminated,
           so print it with an explicit length. */
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif