blob: f82b10299817178049f6515a154600c4f26cf18e [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* Byte-level identifier classification.

   is_potential_identifier_start(c) is true for ASCII letters, '_' and any
   value >= 128; is_potential_identifier_char(c) additionally accepts ASCII
   digits.

   The argument is fully parenthesized so that an expression argument
   (e.g. a conditional expression) is classified as a whole.  It is still
   evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Serhiy Storchakac6792272013-10-19 21:03:34 +030035extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000036/* Return malloc'ed string including trailing \n;
37 empty malloc'ed string for EOF;
38 NULL if interrupted */
39
Guido van Rossum4fe87291992-02-26 15:24:44 +000040/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000044static struct tok_state *tok_new(void);
45static int tok_nextc(struct tok_state *tok);
46static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000047
Brett Cannond5ec98c2007-10-20 02:54:14 +000048
Guido van Rossumdcfcd142019-01-31 03:40:27 -080049/* Spaces in this constant are treated as "zero or more spaces or tabs" when
50 tokenizing. */
51static const char* type_comment_prefix = "# type: ";
52
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053/* Create and initialize a new tok_state structure */
54
55static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000056tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59 sizeof(struct tok_state));
60 if (tok == NULL)
61 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -060062 tok->buf = tok->cur = tok->inp = NULL;
63 tok->start = NULL;
64 tok->end = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065 tok->done = E_OK;
66 tok->fp = NULL;
67 tok->input = NULL;
68 tok->tabsize = TABSIZE;
69 tok->indent = 0;
70 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000072 tok->atbol = 1;
73 tok->pendin = 0;
74 tok->prompt = tok->nextprompt = NULL;
75 tok->lineno = 0;
76 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->altindstack[0] = 0;
78 tok->decoding_state = STATE_INIT;
79 tok->decoding_erred = 0;
80 tok->read_coding_spec = 0;
81 tok->enc = NULL;
82 tok->encoding = NULL;
83 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020084 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000085 tok->decoding_readline = NULL;
86 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080087 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030088
Guido van Rossum495da292019-03-07 12:38:08 -080089 tok->async_hacks = 0;
90 tok->async_def = 0;
91 tok->async_def_indent = 0;
92 tok->async_def_nl = 0;
93
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000094 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095}
96
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000097static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000099{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700101 if (!result) {
102 tok->done = E_NOMEM;
103 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700105 memcpy(result, s, len);
106 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000108}
109
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000110static char *
111error_ret(struct tok_state *tok) /* XXX */
112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 tok->decoding_erred = 1;
114 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
115 PyMem_FREE(tok->buf);
Andy Lester384f3c52020-02-27 20:44:52 -0600116 tok->buf = tok->cur = tok->inp = NULL;
117 tok->start = NULL;
118 tok->end = NULL;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200119 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121}
122
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000123
/* Map the spellings of utf-8 and latin-1 that are recognized below onto one
   canonical name.

   At most the first 12 bytes of S are examined, after lower-casing them and
   replacing '_' with '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" for a recognized name (optionally followed by "-<suffix>"),
   otherwise returns S itself, so callers can detect "unchanged" by pointer
   comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char; plain char may be negative for bytes >= 0x80. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
152
153/* Return the coding spec in S, or NULL if none is found. */
154
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155static int
156get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000157{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 /* Coding spec must be in a comment, and that comment must be
161 * the only statement on the source code line. */
162 for (i = 0; i < size - 6; i++) {
163 if (s[i] == '#')
164 break;
165 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 }
168 for (; i < size - 6; i++) { /* XXX inefficient search */
169 const char* t = s + i;
170 if (strncmp(t, "coding", 6) == 0) {
171 const char* begin = NULL;
172 t += 6;
173 if (t[0] != ':' && t[0] != '=')
174 continue;
175 do {
176 t++;
177 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 begin = t;
180 while (Py_ISALNUM(t[0]) ||
181 t[0] == '-' || t[0] == '_' || t[0] == '.')
182 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700185 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200186 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700187 if (!r)
188 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700189 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 if (r != q) {
191 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 r = new_string(q, strlen(q), tok);
193 if (!r)
194 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700196 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200197 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
199 }
200 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700201 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
204/* Check whether the line contains a coding spec. If it does,
205 invoke the set_readline function for the new encoding.
206 This function receives the tok_state and the new encoding.
207 Return 1 on success, 0 on failure. */
208
209static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000212{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000215
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200220 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700221 if (!get_coding_spec(line, &cs, size, tok))
222 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200223 if (!cs) {
224 Py_ssize_t i;
225 for (i = 0; i < size; i++) {
226 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
227 break;
228 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
229 /* Stop checking coding spec after a line containing
230 * anything except a comment. */
231 tok->read_coding_spec = 1;
232 break;
233 }
234 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700235 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200236 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 tok->read_coding_spec = 1;
238 if (tok->encoding == NULL) {
239 assert(tok->decoding_state == STATE_RAW);
240 if (strcmp(cs, "utf-8") == 0) {
241 tok->encoding = cs;
242 } else {
243 r = set_readline(tok, cs);
244 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700248 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300249 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 "encoding problem: %s", cs);
251 PyMem_FREE(cs);
252 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000253 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700254 } else { /* then, compare cs with BOM */
255 r = (strcmp(tok->encoding, cs) == 0);
256 if (!r)
257 PyErr_Format(PyExc_SyntaxError,
258 "encoding problem: %s with BOM", cs);
259 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262}
263
264/* See whether the file starts with a BOM. If it does,
265 invoke the set_readline function with the new encoding.
266 Return 1 on success, 0 on failure. */
267
268static int
269check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 void unget_char(int, struct tok_state *),
271 int set_readline(struct tok_state *, const char *),
272 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 int ch1, ch2, ch3;
275 ch1 = get_char(tok);
276 tok->decoding_state = STATE_RAW;
277 if (ch1 == EOF) {
278 return 1;
279 } else if (ch1 == 0xEF) {
280 ch2 = get_char(tok);
281 if (ch2 != 0xBB) {
282 unget_char(ch2, tok);
283 unget_char(ch1, tok);
284 return 1;
285 }
286 ch3 = get_char(tok);
287 if (ch3 != 0xBF) {
288 unget_char(ch3, tok);
289 unget_char(ch2, tok);
290 unget_char(ch1, tok);
291 return 1;
292 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000294 /* Disable support for UTF-16 BOMs until a decision
295 is made whether this needs to be supported. */
296 } else if (ch1 == 0xFE) {
297 ch2 = get_char(tok);
298 if (ch2 != 0xFF) {
299 unget_char(ch2, tok);
300 unget_char(ch1, tok);
301 return 1;
302 }
303 if (!set_readline(tok, "utf-16-be"))
304 return 0;
305 tok->decoding_state = STATE_NORMAL;
306 } else if (ch1 == 0xFF) {
307 ch2 = get_char(tok);
308 if (ch2 != 0xFE) {
309 unget_char(ch2, tok);
310 unget_char(ch1, tok);
311 return 1;
312 }
313 if (!set_readline(tok, "utf-16-le"))
314 return 0;
315 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000316#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 } else {
318 unget_char(ch1, tok);
319 return 1;
320 }
321 if (tok->encoding != NULL)
322 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700323 tok->encoding = new_string("utf-8", 5, tok);
324 if (!tok->encoding)
325 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 /* No need to set_readline: input is already utf-8 */
327 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328}
329
330/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000331 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000333 On entry, tok->decoding_buffer will be one of:
334 1) NULL: need to call tok->decoding_readline to get a new line
335 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000336 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000337 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 (in the s buffer) to copy entire contents of the line read
339 by tok->decoding_readline. tok->decoding_buffer has the overflow.
340 In this case, fp_readl is called in a loop (with an expanded buffer)
341 until the buffer ends with a '\n' (or until the end of the file is
342 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344
345static char *
346fp_readl(char *s, int size, struct tok_state *tok)
347{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 PyObject* bufobj;
349 const char *buf;
350 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 /* Ask for one less byte so we can terminate it */
353 assert(size > 0);
354 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 if (tok->decoding_buffer) {
357 bufobj = tok->decoding_buffer;
358 Py_INCREF(bufobj);
359 }
360 else
361 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100362 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000363 if (bufobj == NULL)
364 goto error;
365 }
366 if (PyUnicode_CheckExact(bufobj))
367 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200368 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000369 if (buf == NULL) {
370 goto error;
371 }
372 }
373 else
374 {
375 buf = PyByteArray_AsString(bufobj);
376 if (buf == NULL) {
377 goto error;
378 }
379 buflen = PyByteArray_GET_SIZE(bufobj);
380 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 Py_XDECREF(tok->decoding_buffer);
383 if (buflen > size) {
384 /* Too many chars, the rest goes into tok->decoding_buffer */
385 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
388 goto error;
389 buflen = size;
390 }
391 else
392 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 memcpy(s, buf, buflen);
395 s[buflen] = '\0';
396 if (buflen == 0) /* EOF */
397 s = NULL;
398 Py_DECREF(bufobj);
399 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000400
401error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000402 Py_XDECREF(bufobj);
403 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000404}
405
406/* Set the readline function for TOK to a StreamReader's
407 readline function. The StreamReader is named ENC.
408
409 This function is called from check_bom and check_coding_spec.
410
411 ENC is usually identical to the future value of tok->encoding,
412 except for the (currently unsupported) case of UTF-16.
413
414 Return 1 on success, 0 on failure. */
415
416static int
417fp_setreadl(struct tok_state *tok, const char* enc)
418{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700419 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200420 _Py_IDENTIFIER(open);
421 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000422 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200423 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
Victor Stinner22a351a2010-10-14 12:04:34 +0000425 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200426 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100427 * position of tok->fp. If tok->fp was opened in text mode on Windows,
428 * its file position counts CRLF as one char and can't be directly mapped
429 * to the file offset for fd. Instead we step back one byte and read to
430 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200431 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100432 if (pos == -1 ||
433 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000434 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700435 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000436 }
437
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700438 io = PyImport_ImportModuleNoBlock("io");
439 if (io == NULL)
440 return 0;
441
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200442 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000443 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000445 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700446 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000447
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200448 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449 Py_DECREF(stream);
450 if (readline == NULL)
451 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300452 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700453
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100454 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100455 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 if (bufobj == NULL)
457 return 0;
458 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100459 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000460
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462}
463
464/* Fetch the next byte from TOK. */
465
466static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468}
469
470/* Unfetch the last byte back into TOK. */
471
472static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
/* Check whether the bytes at s start a structurally valid UTF-8 sequence:
   a lead byte followed by the number of continuation bytes (0x80..0xBF)
   that the lead byte announces.  Return the number of bytes forming the
   sequence if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so the scan stops at the terminator (which is not a continuation
   byte) and never reads beyond it, even for a truncated sequence. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return trailing + 1;
}
503
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504/* Read a line of input from TOK. Determine encoding
505 if necessary. */
506
507static char *
508decoding_fgets(char *s, int size, struct tok_state *tok)
509{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 char *line = NULL;
511 int badchar = 0;
512 for (;;) {
513 if (tok->decoding_state == STATE_NORMAL) {
514 /* We already have a codec associated with
515 this input. */
516 line = fp_readl(s, size, tok);
517 break;
518 } else if (tok->decoding_state == STATE_RAW) {
519 /* We want a 'raw' read. */
520 line = Py_UniversalNewlineFgets(s, size,
521 tok->fp, NULL);
522 break;
523 } else {
524 /* We have not yet determined the encoding.
525 If an encoding is found, use the file-pointer
526 reader functions from now on. */
527 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
528 return error_ret(tok);
529 assert(tok->decoding_state != STATE_INIT);
530 }
531 }
532 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
533 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
534 return error_ret(tok);
535 }
536 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 /* The default encoding is UTF-8, so make sure we don't have any
538 non-UTF-8 sequences in it. */
539 if (line && !tok->encoding) {
540 unsigned char *c;
541 int length;
542 for (c = (unsigned char *)line; *c; c += length)
543 if (!(length = valid_utf8(c))) {
544 badchar = *c;
545 break;
546 }
547 }
548 if (badchar) {
549 /* Need to add 1 to the line number, since this line
550 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200551 PyErr_Format(PyExc_SyntaxError,
552 "Non-UTF-8 code starting with '\\x%.2x' "
553 "in file %U on line %i, "
554 "but no encoding declared; "
555 "see http://python.org/dev/peps/pep-0263/ for details",
556 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 return error_ret(tok);
558 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560}
561
562static int
563decoding_feof(struct tok_state *tok)
564{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000565 if (tok->decoding_state != STATE_NORMAL) {
566 return feof(tok->fp);
567 } else {
568 PyObject* buf = tok->decoding_buffer;
569 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100570 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 if (buf == NULL) {
572 error_ret(tok);
573 return 1;
574 } else {
575 tok->decoding_buffer = buf;
576 }
577 }
578 return PyObject_Length(buf) == 0;
579 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580}
581
582/* Fetch a byte from TOK, using the string buffer. */
583
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000584static int
585buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587}
588
589/* Unfetch a byte from TOK, using the string buffer. */
590
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000591static void
592buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000593 tok->str--;
594 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595}
596
597/* Set the readline function for TOK to ENC. For the string-based
598 tokenizer, this means to just record the encoding. */
599
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000600static int
601buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 tok->enc = enc;
603 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604}
605
606/* Return a UTF-8 encoding Python string object from the
607 C byte string STR, which is encoded with ENC. */
608
609static PyObject *
610translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000611 PyObject *utf8;
612 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
613 if (buf == NULL)
614 return NULL;
615 utf8 = PyUnicode_AsUTF8String(buf);
616 Py_DECREF(buf);
617 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000620
621static char *
622translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200623 int skip_next_lf = 0;
624 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 char *buf, *current;
626 char c = '\0';
627 buf = PyMem_MALLOC(needed_length);
628 if (buf == NULL) {
629 tok->done = E_NOMEM;
630 return NULL;
631 }
632 for (current = buf; *s; s++, current++) {
633 c = *s;
634 if (skip_next_lf) {
635 skip_next_lf = 0;
636 if (c == '\n') {
637 c = *++s;
638 if (!c)
639 break;
640 }
641 }
642 if (c == '\r') {
643 skip_next_lf = 1;
644 c = '\n';
645 }
646 *current = c;
647 }
648 /* If this is exec input, add a newline to the end of the string if
649 there isn't one already. */
650 if (exec_input && c != '\n') {
651 *current = '\n';
652 current++;
653 }
654 *current = '\0';
655 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000656 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 /* should never fail */
Pablo Galindocb90c892019-03-19 17:17:58 +0000658 char* result = PyMem_REALLOC(buf, final_length);
659 if (result == NULL) {
660 PyMem_FREE(buf);
661 }
662 buf = result;
663 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000665}
666
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667/* Decode a byte string STR for use as the buffer of TOK.
668 Look for encoding declarations inside STR, and record them
669 inside TOK. */
670
Andy Lester384f3c52020-02-27 20:44:52 -0600671static char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000672decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000673{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 PyObject* utf8 = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600675 char *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 const char *s;
677 const char *newl[2] = {NULL, NULL};
678 int lineno = 0;
679 tok->input = str = translate_newlines(input, single, tok);
680 if (str == NULL)
681 return NULL;
682 tok->enc = NULL;
683 tok->str = str;
684 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
685 return error_ret(tok);
686 str = tok->str; /* string after BOM if any */
687 assert(str);
688 if (tok->enc != NULL) {
689 utf8 = translate_into_utf8(str, tok->enc);
690 if (utf8 == NULL)
691 return error_ret(tok);
692 str = PyBytes_AsString(utf8);
693 }
694 for (s = str;; s++) {
695 if (*s == '\0') break;
696 else if (*s == '\n') {
697 assert(lineno < 2);
698 newl[lineno] = s;
699 lineno++;
700 if (lineno == 2) break;
701 }
702 }
703 tok->enc = NULL;
704 /* need to check line 1 and 2 separately since check_coding_spec
705 assumes a single line as input */
706 if (newl[0]) {
707 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200709 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711 tok, buf_setreadl))
712 return error_ret(tok);
713 }
714 }
715 if (tok->enc != NULL) {
716 assert(utf8 == NULL);
717 utf8 = translate_into_utf8(str, tok->enc);
718 if (utf8 == NULL)
719 return error_ret(tok);
720 str = PyBytes_AS_STRING(utf8);
721 }
722 assert(tok->decoding_buffer == NULL);
723 tok->decoding_buffer = utf8; /* CAUTION */
724 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000725}
726
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727/* Set up tokenizer for string */
728
729struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000730PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600733 char *decoded;
734
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 if (tok == NULL)
736 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600737 decoded = decode_str(str, exec_input, tok);
738 if (decoded == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyTokenizer_Free(tok);
740 return NULL;
741 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000742
Andy Lester384f3c52020-02-27 20:44:52 -0600743 tok->buf = tok->cur = tok->inp = decoded;
744 tok->end = decoded;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746}
747
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000748struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000749PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000750{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 struct tok_state *tok = tok_new();
Andy Lester384f3c52020-02-27 20:44:52 -0600752 char *translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 if (tok == NULL)
754 return NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600755 tok->input = translated = translate_newlines(str, exec_input, tok);
756 if (translated == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 PyTokenizer_Free(tok);
758 return NULL;
759 }
760 tok->decoding_state = STATE_RAW;
761 tok->read_coding_spec = 1;
762 tok->enc = NULL;
Andy Lester384f3c52020-02-27 20:44:52 -0600763 tok->str = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 tok->encoding = (char *)PyMem_MALLOC(6);
765 if (!tok->encoding) {
766 PyTokenizer_Free(tok);
767 return NULL;
768 }
769 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000770
Andy Lester384f3c52020-02-27 20:44:52 -0600771 tok->buf = tok->cur = tok->inp = translated;
772 tok->end = translated;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000774}
775
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000776/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777
778struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300779PyTokenizer_FromFile(FILE *fp, const char* enc,
780 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 struct tok_state *tok = tok_new();
783 if (tok == NULL)
784 return NULL;
785 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
786 PyTokenizer_Free(tok);
787 return NULL;
788 }
789 tok->cur = tok->inp = tok->buf;
790 tok->end = tok->buf + BUFSIZ;
791 tok->fp = fp;
792 tok->prompt = ps1;
793 tok->nextprompt = ps2;
794 if (enc != NULL) {
795 /* Must copy encoding declaration since it
796 gets copied into the parse tree. */
797 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
798 if (!tok->encoding) {
799 PyTokenizer_Free(tok);
800 return NULL;
801 }
802 strcpy(tok->encoding, enc);
803 tok->decoding_state = STATE_NORMAL;
804 }
805 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806}
807
808
809/* Free a tok_state structure */
810
811void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000812PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 if (tok->encoding != NULL)
815 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 Py_XDECREF(tok->decoding_readline);
817 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200818 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 if (tok->fp != NULL && tok->buf != NULL)
820 PyMem_FREE(tok->buf);
821 if (tok->input)
Andy Lester384f3c52020-02-27 20:44:52 -0600822 PyMem_FREE(tok->input);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824}
825
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826/* Get next char, updating state; error code goes into tok->done */
827
828static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200829tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000830{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 for (;;) {
832 if (tok->cur != tok->inp) {
833 return Py_CHARMASK(*tok->cur++); /* Fast path */
834 }
835 if (tok->done != E_OK)
836 return EOF;
837 if (tok->fp == NULL) {
838 char *end = strchr(tok->inp, '\n');
839 if (end != NULL)
840 end++;
841 else {
842 end = strchr(tok->inp, '\0');
843 if (end == tok->inp) {
844 tok->done = E_EOF;
845 return EOF;
846 }
847 }
848 if (tok->start == NULL)
849 tok->buf = tok->cur;
850 tok->line_start = tok->cur;
851 tok->lineno++;
852 tok->inp = end;
853 return Py_CHARMASK(*tok->cur++);
854 }
855 if (tok->prompt != NULL) {
856 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000857 if (newtok != NULL) {
858 char *translated = translate_newlines(newtok, 0, tok);
859 PyMem_FREE(newtok);
860 if (translated == NULL)
861 return EOF;
862 newtok = translated;
863 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 if (tok->encoding && newtok && *newtok) {
865 /* Recode to UTF-8 */
866 Py_ssize_t buflen;
867 const char* buf;
868 PyObject *u = translate_into_utf8(newtok, tok->encoding);
869 PyMem_FREE(newtok);
870 if (!u) {
871 tok->done = E_DECODE;
872 return EOF;
873 }
874 buflen = PyBytes_GET_SIZE(u);
875 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700877 if (newtok == NULL) {
878 Py_DECREF(u);
879 tok->done = E_NOMEM;
880 return EOF;
881 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 strcpy(newtok, buf);
883 Py_DECREF(u);
884 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 if (tok->nextprompt != NULL)
886 tok->prompt = tok->nextprompt;
887 if (newtok == NULL)
888 tok->done = E_INTR;
889 else if (*newtok == '\0') {
890 PyMem_FREE(newtok);
891 tok->done = E_EOF;
892 }
893 else if (tok->start != NULL) {
894 size_t start = tok->start - tok->buf;
895 size_t oldlen = tok->cur - tok->buf;
896 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000897 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 char *buf = tok->buf;
899 buf = (char *)PyMem_REALLOC(buf, newlen+1);
900 tok->lineno++;
901 if (buf == NULL) {
902 PyMem_FREE(tok->buf);
903 tok->buf = NULL;
904 PyMem_FREE(newtok);
905 tok->done = E_NOMEM;
906 return EOF;
907 }
908 tok->buf = buf;
909 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000910 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 tok->line_start = tok->cur;
912 strcpy(tok->buf + oldlen, newtok);
913 PyMem_FREE(newtok);
914 tok->inp = tok->buf + newlen;
915 tok->end = tok->inp + 1;
916 tok->start = tok->buf + start;
917 }
918 else {
919 tok->lineno++;
920 if (tok->buf != NULL)
921 PyMem_FREE(tok->buf);
922 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 tok->cur = tok->buf;
924 tok->line_start = tok->buf;
925 tok->inp = strchr(tok->buf, '\0');
926 tok->end = tok->inp + 1;
927 }
928 }
929 else {
930 int done = 0;
931 Py_ssize_t cur = 0;
932 char *pt;
933 if (tok->start == NULL) {
934 if (tok->buf == NULL) {
935 tok->buf = (char *)
936 PyMem_MALLOC(BUFSIZ);
937 if (tok->buf == NULL) {
938 tok->done = E_NOMEM;
939 return EOF;
940 }
941 tok->end = tok->buf + BUFSIZ;
942 }
943 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
944 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200945 if (!tok->decoding_erred)
946 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000947 done = 1;
948 }
949 else {
950 tok->done = E_OK;
951 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700952 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 }
954 }
955 else {
956 cur = tok->cur - tok->buf;
957 if (decoding_feof(tok)) {
958 tok->done = E_EOF;
959 done = 1;
960 }
961 else
962 tok->done = E_OK;
963 }
964 tok->lineno++;
965 /* Read until '\n' or EOF */
966 while (!done) {
967 Py_ssize_t curstart = tok->start == NULL ? -1 :
968 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700969 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 Py_ssize_t curvalid = tok->inp - tok->buf;
971 Py_ssize_t newsize = curvalid + BUFSIZ;
972 char *newbuf = tok->buf;
973 newbuf = (char *)PyMem_REALLOC(newbuf,
974 newsize);
975 if (newbuf == NULL) {
976 tok->done = E_NOMEM;
977 tok->cur = tok->inp;
978 return EOF;
979 }
980 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200981 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700982 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200983 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000984 tok->inp = tok->buf + curvalid;
985 tok->end = tok->buf + newsize;
986 tok->start = curstart < 0 ? NULL :
987 tok->buf + curstart;
988 if (decoding_fgets(tok->inp,
989 (int)(tok->end - tok->inp),
990 tok) == NULL) {
991 /* Break out early on decoding
992 errors, as tok->buf will be NULL
993 */
994 if (tok->decoding_erred)
995 return EOF;
996 /* Last line does not end in \n,
997 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -0700998 if (tok->inp[-1] != '\n')
999 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 }
1001 tok->inp = strchr(tok->inp, '\0');
1002 done = tok->inp[-1] == '\n';
1003 }
1004 if (tok->buf != NULL) {
1005 tok->cur = tok->buf + cur;
1006 tok->line_start = tok->cur;
1007 /* replace "\r\n" with "\n" */
1008 /* For Mac leave the \r, giving a syntax error */
1009 pt = tok->inp - 2;
1010 if (pt >= tok->buf && *pt == '\r') {
1011 *pt++ = '\n';
1012 *pt = '\0';
1013 tok->inp = pt;
1014 }
1015 }
1016 }
1017 if (tok->done != E_OK) {
1018 if (tok->prompt != NULL)
1019 PySys_WriteStderr("\n");
1020 tok->cur = tok->inp;
1021 return EOF;
1022 }
1023 }
1024 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025}
1026
1027
1028/* Back-up one character */
1029
1030static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001031tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 if (c != EOF) {
1034 if (--tok->cur < tok->buf)
1035 Py_FatalError("tok_backup: beginning of buffer");
1036 if (*tok->cur != c)
1037 *tok->cur = c;
1038 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039}
1040
1041
Guido van Rossum926f13a1998-04-09 21:38:06 +00001042static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001043syntaxerror(struct tok_state *tok, const char *format, ...)
1044{
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001045 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001046 va_list vargs;
1047#ifdef HAVE_STDARG_PROTOTYPES
1048 va_start(vargs, format);
1049#else
1050 va_start(vargs);
1051#endif
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001052 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001053 va_end(vargs);
Serhiy Storchaka0cc6b5e2020-02-12 12:17:00 +02001054 if (!errmsg) {
1055 goto error;
1056 }
1057
1058 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1059 "replace");
1060 if (!errtext) {
1061 goto error;
1062 }
1063 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1064 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1065 if (line_len != tok->cur - tok->line_start) {
1066 Py_DECREF(errtext);
1067 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1068 "replace");
1069 }
1070 if (!errtext) {
1071 goto error;
1072 }
1073
1074 args = Py_BuildValue("(O(OiiN))", errmsg,
1075 tok->filename, tok->lineno, offset, errtext);
1076 if (args) {
1077 PyErr_SetObject(PyExc_SyntaxError, args);
1078 Py_DECREF(args);
1079 }
1080
1081error:
1082 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001083 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001084 return ERRORTOKEN;
1085}
1086
1087static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001088indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001089{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001090 tok->done = E_TABSPACE;
1091 tok->cur = tok->inp;
1092 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001093}
1094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095/* Verify that the identifier follows PEP 3131.
1096 All identifier strings are guaranteed to be "ready" unicode objects.
1097 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001098static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001099verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001100{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 PyObject *s;
1102 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001103 if (tok->decoding_erred)
1104 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001106 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1108 PyErr_Clear();
1109 tok->done = E_IDENTIFIER;
1110 } else {
1111 tok->done = E_ERROR;
1112 }
1113 return 0;
1114 }
1115 result = PyUnicode_IsIdentifier(s);
1116 Py_DECREF(s);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001117 if (result == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 tok->done = E_IDENTIFIER;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +01001119 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001121}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001122
/* Consume the remainder of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character that is neither a digit nor a separating
   underscore; that character has been consumed and the caller decides what
   to do with it.  If an underscore is not followed by a digit, the
   offending character is pushed back, a SyntaxError ("invalid decimal
   literal") is set and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Skip a group of digits; ch ends up as the first non-digit. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }

        /* An underscore must be followed by at least one more digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1144
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001145/* Get next token, after space stripping etc. */
1146
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001147static int
Andy Lester384f3c52020-02-27 20:44:52 -06001148tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001150 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001152
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001154 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 tok->start = NULL;
1156 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001157
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001158 /* Get indentation level */
1159 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001160 int col = 0;
1161 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 tok->atbol = 0;
1163 for (;;) {
1164 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001165 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001167 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001169 col = (col / tok->tabsize + 1) * tok->tabsize;
1170 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 }
Brett Cannona721aba2016-09-09 14:57:09 -07001172 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001174 }
1175 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001176 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001177 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 }
1179 tok_backup(tok, c);
1180 if (c == '#' || c == '\n') {
1181 /* Lines with only whitespace and/or comments
1182 shouldn't affect the indentation and are
1183 not passed to the parser as NEWLINE tokens,
1184 except *totally* empty lines in interactive
1185 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001186 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001187 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001188 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001189 else if (tok->prompt != NULL && tok->lineno == 1) {
1190 /* In interactive mode, if the first line contains
1191 only spaces and/or a comment, let it through. */
1192 blankline = 0;
1193 col = altcol = 0;
1194 }
Brett Cannona721aba2016-09-09 14:57:09 -07001195 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001197 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 /* We can't jump back right here since we still
1199 may need to skip to the end of a comment */
1200 }
1201 if (!blankline && tok->level == 0) {
1202 if (col == tok->indstack[tok->indent]) {
1203 /* No change */
1204 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001205 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 }
1207 }
1208 else if (col > tok->indstack[tok->indent]) {
1209 /* Indent -- always one */
1210 if (tok->indent+1 >= MAXINDENT) {
1211 tok->done = E_TOODEEP;
1212 tok->cur = tok->inp;
1213 return ERRORTOKEN;
1214 }
1215 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001216 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 }
1218 tok->pendin++;
1219 tok->indstack[++tok->indent] = col;
1220 tok->altindstack[tok->indent] = altcol;
1221 }
1222 else /* col < tok->indstack[tok->indent] */ {
1223 /* Dedent -- any number, must be consistent */
1224 while (tok->indent > 0 &&
1225 col < tok->indstack[tok->indent]) {
1226 tok->pendin--;
1227 tok->indent--;
1228 }
1229 if (col != tok->indstack[tok->indent]) {
1230 tok->done = E_DEDENT;
1231 tok->cur = tok->inp;
1232 return ERRORTOKEN;
1233 }
1234 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001235 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001236 }
1237 }
1238 }
1239 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001240
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001241 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001242
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 /* Return pending indents/dedents */
1244 if (tok->pendin != 0) {
1245 if (tok->pendin < 0) {
1246 tok->pendin++;
1247 return DEDENT;
1248 }
1249 else {
1250 tok->pendin--;
1251 return INDENT;
1252 }
1253 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001254
Guido van Rossum495da292019-03-07 12:38:08 -08001255 /* Peek ahead at the next character */
1256 c = tok_nextc(tok);
1257 tok_backup(tok, c);
1258 /* Check if we are closing an async function */
1259 if (tok->async_def
1260 && !blankline
1261 /* Due to some implementation artifacts of type comments,
1262 * a TYPE_COMMENT at the start of a function won't set an
1263 * indentation level and it will produce a NEWLINE after it.
1264 * To avoid spuriously ending an async function due to this,
1265 * wait until we have some non-newline char in front of us. */
1266 && c != '\n'
1267 && tok->level == 0
1268 /* There was a NEWLINE after ASYNC DEF,
1269 so we're past the signature. */
1270 && tok->async_def_nl
1271 /* Current indentation level is less than where
1272 the async function was defined */
1273 && tok->async_def_indent >= tok->indent)
1274 {
1275 tok->async_def = 0;
1276 tok->async_def_indent = 0;
1277 tok->async_def_nl = 0;
1278 }
1279
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001281 tok->start = NULL;
1282 /* Skip spaces */
1283 do {
1284 c = tok_nextc(tok);
1285 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001286
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 /* Set start of current token */
1288 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001289
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001290 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001291 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001292 const char *prefix, *p, *type_start;
1293
Brett Cannona721aba2016-09-09 14:57:09 -07001294 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001296 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001297
1298 if (tok->type_comments) {
1299 p = tok->start;
1300 prefix = type_comment_prefix;
1301 while (*prefix && p < tok->cur) {
1302 if (*prefix == ' ') {
1303 while (*p == ' ' || *p == '\t') {
1304 p++;
1305 }
1306 } else if (*prefix == *p) {
1307 p++;
1308 } else {
1309 break;
1310 }
1311
1312 prefix++;
1313 }
1314
1315 /* This is a type comment if we matched all of type_comment_prefix. */
1316 if (!*prefix) {
1317 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001318 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001319 tok_backup(tok, c); /* don't eat the newline or EOF */
1320
1321 type_start = p;
1322
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001323 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001324 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001325 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001326 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001327 && !(tok->cur > ignore_end
1328 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001329
1330 if (is_type_ignore) {
Andy Lester384f3c52020-02-27 20:44:52 -06001331 *p_start = ignore_end;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001332 *p_end = tok->cur;
1333
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001334 /* If this type ignore is the only thing on the line, consume the newline also. */
1335 if (blankline) {
1336 tok_nextc(tok);
1337 tok->atbol = 1;
1338 }
1339 return TYPE_IGNORE;
1340 } else {
Andy Lester384f3c52020-02-27 20:44:52 -06001341 *p_start = type_start; /* after type_comment_prefix */
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001342 *p_end = tok->cur;
1343 return TYPE_COMMENT;
1344 }
1345 }
1346 }
Brett Cannona721aba2016-09-09 14:57:09 -07001347 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001348
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 /* Check for EOF and errors now */
1350 if (c == EOF) {
1351 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1352 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001353
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354 /* Identifier (most frequent token!) */
1355 nonascii = 0;
1356 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001357 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001358 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001359 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001360 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001361 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001362 /* Since this is a backwards compatibility support literal we don't
1363 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001364 else if (!(saw_b || saw_u || saw_r || saw_f)
1365 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001366 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001367 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001368 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001369 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001370 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001371 }
1372 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001373 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001374 }
1375 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001376 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001377 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001379 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001381 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 }
1383 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001384 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001386 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 c = tok_nextc(tok);
1388 }
1389 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001390 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001392 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 *p_start = tok->start;
1394 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001395
Guido van Rossum495da292019-03-07 12:38:08 -08001396 /* async/await parsing block. */
1397 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1398 /* May be an 'async' or 'await' token. For Python 3.7 or
1399 later we recognize them unconditionally. For Python
1400 3.5 or 3.6 we recognize 'async' in front of 'def', and
1401 either one inside of 'async def'. (Technically we
1402 shouldn't recognize these at all for 3.4 or earlier,
1403 but there's no *valid* Python 3.4 code that would be
1404 rejected, and async functions will be rejected in a
1405 later phase.) */
1406 if (!tok->async_hacks || tok->async_def) {
1407 /* Always recognize the keywords. */
1408 if (memcmp(tok->start, "async", 5) == 0) {
1409 return ASYNC;
1410 }
1411 if (memcmp(tok->start, "await", 5) == 0) {
1412 return AWAIT;
1413 }
1414 }
1415 else if (memcmp(tok->start, "async", 5) == 0) {
1416 /* The current token is 'async'.
1417 Look ahead one token to see if that is 'def'. */
1418
1419 struct tok_state ahead_tok;
Andy Lester384f3c52020-02-27 20:44:52 -06001420 const char *ahead_tok_start = NULL;
1421 const char *ahead_tok_end = NULL;
Guido van Rossum495da292019-03-07 12:38:08 -08001422 int ahead_tok_kind;
1423
1424 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1425 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1426 &ahead_tok_end);
1427
1428 if (ahead_tok_kind == NAME
1429 && ahead_tok.cur - ahead_tok.start == 3
1430 && memcmp(ahead_tok.start, "def", 3) == 0)
1431 {
1432 /* The next token is going to be 'def', so instead of
1433 returning a plain NAME token, return ASYNC. */
1434 tok->async_def_indent = tok->indent;
1435 tok->async_def = 1;
1436 return ASYNC;
1437 }
1438 }
1439 }
1440
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 return NAME;
1442 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001443
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 /* Newline */
1445 if (c == '\n') {
1446 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001447 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001448 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001449 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 *p_start = tok->start;
1451 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1452 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001453 if (tok->async_def) {
1454 /* We're somewhere inside an 'async def' function, and
1455 we've encountered a NEWLINE after its signature. */
1456 tok->async_def_nl = 1;
1457 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001458 return NEWLINE;
1459 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 /* Period or number starting with period? */
1462 if (c == '.') {
1463 c = tok_nextc(tok);
1464 if (isdigit(c)) {
1465 goto fraction;
1466 } else if (c == '.') {
1467 c = tok_nextc(tok);
1468 if (c == '.') {
1469 *p_start = tok->start;
1470 *p_end = tok->cur;
1471 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001472 }
1473 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001474 tok_backup(tok, c);
1475 }
1476 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001477 }
1478 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001479 tok_backup(tok, c);
1480 }
1481 *p_start = tok->start;
1482 *p_end = tok->cur;
1483 return DOT;
1484 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001485
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 /* Number */
1487 if (isdigit(c)) {
1488 if (c == '0') {
1489 /* Hex, octal or binary -- maybe. */
1490 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001492 /* Hex */
1493 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001494 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001495 if (c == '_') {
1496 c = tok_nextc(tok);
1497 }
1498 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001499 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001500 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001501 }
1502 do {
1503 c = tok_nextc(tok);
1504 } while (isxdigit(c));
1505 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 }
1507 else if (c == 'o' || c == 'O') {
1508 /* Octal */
1509 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001510 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001511 if (c == '_') {
1512 c = tok_nextc(tok);
1513 }
1514 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001515 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001516 if (isdigit(c)) {
1517 return syntaxerror(tok,
1518 "invalid digit '%c' in octal literal", c);
1519 }
1520 else {
1521 return syntaxerror(tok, "invalid octal literal");
1522 }
Brett Cannona721aba2016-09-09 14:57:09 -07001523 }
1524 do {
1525 c = tok_nextc(tok);
1526 } while ('0' <= c && c < '8');
1527 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001528 if (isdigit(c)) {
1529 return syntaxerror(tok,
1530 "invalid digit '%c' in octal literal", c);
1531 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 }
1533 else if (c == 'b' || c == 'B') {
1534 /* Binary */
1535 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001536 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001537 if (c == '_') {
1538 c = tok_nextc(tok);
1539 }
1540 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001541 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001542 if (isdigit(c)) {
1543 return syntaxerror(tok,
1544 "invalid digit '%c' in binary literal", c);
1545 }
1546 else {
1547 return syntaxerror(tok, "invalid binary literal");
1548 }
Brett Cannona721aba2016-09-09 14:57:09 -07001549 }
1550 do {
1551 c = tok_nextc(tok);
1552 } while (c == '0' || c == '1');
1553 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001554 if (isdigit(c)) {
1555 return syntaxerror(tok,
1556 "invalid digit '%c' in binary literal", c);
1557 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 }
1559 else {
1560 int nonzero = 0;
1561 /* maybe old-style octal; c is first char of it */
1562 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001563 while (1) {
1564 if (c == '_') {
1565 c = tok_nextc(tok);
1566 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001567 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001568 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001569 }
1570 }
1571 if (c != '0') {
1572 break;
1573 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001574 c = tok_nextc(tok);
1575 }
Brett Cannona721aba2016-09-09 14:57:09 -07001576 if (isdigit(c)) {
1577 nonzero = 1;
1578 c = tok_decimal_tail(tok);
1579 if (c == 0) {
1580 return ERRORTOKEN;
1581 }
1582 }
1583 if (c == '.') {
1584 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001585 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001586 }
1587 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001588 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001589 }
1590 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001592 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001594 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001596 return syntaxerror(tok,
1597 "leading zeros in decimal integer "
1598 "literals are not permitted; "
1599 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 }
1601 }
1602 }
1603 else {
1604 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001605 c = tok_decimal_tail(tok);
1606 if (c == 0) {
1607 return ERRORTOKEN;
1608 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 {
1610 /* Accept floating point numbers. */
1611 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001612 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 fraction:
1614 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001615 if (isdigit(c)) {
1616 c = tok_decimal_tail(tok);
1617 if (c == 0) {
1618 return ERRORTOKEN;
1619 }
1620 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001621 }
1622 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001623 int e;
1624 exponent:
1625 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 /* Exponent part */
1627 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001628 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001630 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001631 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001632 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001633 }
1634 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001636 tok_backup(tok, e);
1637 *p_start = tok->start;
1638 *p_end = tok->cur;
1639 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001640 }
Brett Cannona721aba2016-09-09 14:57:09 -07001641 c = tok_decimal_tail(tok);
1642 if (c == 0) {
1643 return ERRORTOKEN;
1644 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 }
Brett Cannona721aba2016-09-09 14:57:09 -07001646 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 /* Imaginary part */
1648 imaginary:
1649 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001650 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 }
1652 }
1653 tok_backup(tok, c);
1654 *p_start = tok->start;
1655 *p_end = tok->cur;
1656 return NUMBER;
1657 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001658
1659 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 /* String */
1661 if (c == '\'' || c == '"') {
1662 int quote = c;
1663 int quote_size = 1; /* 1 or 3 */
1664 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001665
Anthony Sottile995d9b92019-01-12 20:05:13 -08001666 /* Nodes of type STRING, especially multi line strings
1667 must be handled differently in order to get both
1668 the starting line number and the column offset right.
1669 (cf. issue 16806) */
1670 tok->first_lineno = tok->lineno;
1671 tok->multi_line_start = tok->line_start;
1672
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 /* Find the quote size and start of string */
1674 c = tok_nextc(tok);
1675 if (c == quote) {
1676 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001677 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001679 }
1680 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001682 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 }
Brett Cannona721aba2016-09-09 14:57:09 -07001684 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001686 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001687
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 /* Get rest of string */
1689 while (end_quote_size != quote_size) {
1690 c = tok_nextc(tok);
1691 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001692 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001694 }
1695 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001697 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 tok->cur = tok->inp;
1699 return ERRORTOKEN;
1700 }
1701 if (quote_size == 1 && c == '\n') {
1702 tok->done = E_EOLS;
1703 tok->cur = tok->inp;
1704 return ERRORTOKEN;
1705 }
Brett Cannona721aba2016-09-09 14:57:09 -07001706 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001708 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001709 else {
1710 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001711 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001712 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001713 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 }
1715 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001716
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 *p_start = tok->start;
1718 *p_end = tok->cur;
1719 return STRING;
1720 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001721
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 /* Line continuation */
1723 if (c == '\\') {
1724 c = tok_nextc(tok);
1725 if (c != '\n') {
1726 tok->done = E_LINECONT;
1727 tok->cur = tok->inp;
1728 return ERRORTOKEN;
1729 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001730 c = tok_nextc(tok);
1731 if (c == EOF) {
1732 tok->done = E_EOF;
1733 tok->cur = tok->inp;
1734 return ERRORTOKEN;
1735 } else {
1736 tok_backup(tok, c);
1737 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 tok->cont_line = 1;
1739 goto again; /* Read next line */
1740 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001741
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 /* Check for two-character token */
1743 {
1744 int c2 = tok_nextc(tok);
1745 int token = PyToken_TwoChars(c, c2);
1746 if (token != OP) {
1747 int c3 = tok_nextc(tok);
1748 int token3 = PyToken_ThreeChars(c, c2, c3);
1749 if (token3 != OP) {
1750 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001751 }
1752 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 tok_backup(tok, c3);
1754 }
1755 *p_start = tok->start;
1756 *p_end = tok->cur;
1757 return token;
1758 }
1759 tok_backup(tok, c2);
1760 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001761
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001762 /* Keep track of parentheses nesting level */
1763 switch (c) {
1764 case '(':
1765 case '[':
1766 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001767 if (tok->level >= MAXLEVEL) {
1768 return syntaxerror(tok, "too many nested parentheses");
1769 }
1770 tok->parenstack[tok->level] = c;
1771 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 tok->level++;
1773 break;
1774 case ')':
1775 case ']':
1776 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001777 if (!tok->level) {
1778 return syntaxerror(tok, "unmatched '%c'", c);
1779 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001780 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001781 int opening = tok->parenstack[tok->level];
1782 if (!((opening == '(' && c == ')') ||
1783 (opening == '[' && c == ']') ||
1784 (opening == '{' && c == '}')))
1785 {
1786 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1787 return syntaxerror(tok,
1788 "closing parenthesis '%c' does not match "
1789 "opening parenthesis '%c' on line %d",
1790 c, opening, tok->parenlinenostack[tok->level]);
1791 }
1792 else {
1793 return syntaxerror(tok,
1794 "closing parenthesis '%c' does not match "
1795 "opening parenthesis '%c'",
1796 c, opening);
1797 }
1798 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001799 break;
1800 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001801
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 /* Punctuation character */
1803 *p_start = tok->start;
1804 *p_end = tok->cur;
1805 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001806}
1807
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001808int
Andy Lester384f3c52020-02-27 20:44:52 -06001809PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001810{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 int result = tok_get(tok, p_start, p_end);
1812 if (tok->decoding_erred) {
1813 result = ERRORTOKEN;
1814 tok->done = E_DECODE;
1815 }
1816 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001817}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001818
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001819/* Get the encoding of a Python file. Check for the coding cookie and check if
1820 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001821
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001822 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1823 encoding in the first or second line of the file (in which case the encoding
1824 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001825
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001826 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1827 by the caller. */
1828
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001829char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001830PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001831{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001832 struct tok_state *tok;
1833 FILE *fp;
Andy Lester384f3c52020-02-27 20:44:52 -06001834 const char *p_start = NULL;
1835 const char *p_end = NULL;
1836 char *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001837
Victor Stinnerdaf45552013-08-28 00:53:59 +02001838 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 if (fd < 0) {
1840 return NULL;
1841 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001842
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001843 fp = fdopen(fd, "r");
1844 if (fp == NULL) {
1845 return NULL;
1846 }
1847 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1848 if (tok == NULL) {
1849 fclose(fp);
1850 return NULL;
1851 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001852 if (filename != NULL) {
1853 Py_INCREF(filename);
1854 tok->filename = filename;
1855 }
1856 else {
1857 tok->filename = PyUnicode_FromString("<string>");
1858 if (tok->filename == NULL) {
1859 fclose(fp);
1860 PyTokenizer_Free(tok);
1861 return encoding;
1862 }
1863 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001864 while (tok->lineno < 2 && tok->done == E_OK) {
1865 PyTokenizer_Get(tok, &p_start, &p_end);
1866 }
1867 fclose(fp);
1868 if (tok->encoding) {
1869 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1870 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301871 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001872 }
1873 PyTokenizer_Free(tok);
1874 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001875}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001876
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001877char *
1878PyTokenizer_FindEncoding(int fd)
1879{
1880 return PyTokenizer_FindEncodingFilename(fd, NULL);
1881}
1882
Guido van Rossum408027e1996-12-30 16:17:54 +00001883#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001884
1885void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001886tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001887{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001888 printf("%s", _PyParser_TokenNames[type]);
1889 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1890 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001891}
1892
1893#endif