blob: aecbcebb917e8370327940e7a98b40684ad5d303 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Miss Islington (bot)efd878c2020-02-12 02:35:10 -08004#define PY_SSIZE_T_CLEAN
Jack Jansen7b8c7542002-04-14 20:12:41 +00005#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00006
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000014#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000015#include "fileobject.h"
16#include "codecs.h"
17#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000018
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080019/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
/* True when C may begin an identifier: an ASCII letter, '_', or any value
   >= 128.  The argument is fully parenthesized so compound expressions are
   handled correctly, but it is still evaluated several times and therefore
   must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

/* True when C may appear inside an identifier: everything accepted by
   is_potential_identifier_start() plus the ASCII digits.  Same caveat about
   repeated evaluation of the argument. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000034
Serhiy Storchakac6792272013-10-19 21:03:34 +030035extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000036/* Return malloc'ed string including trailing \n;
37 empty malloc'ed string for EOF;
38 NULL if interrupted */
39
Guido van Rossum4fe87291992-02-26 15:24:44 +000040/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000042
Guido van Rossum3f5da241990-12-20 15:06:42 +000043/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000044static struct tok_state *tok_new(void);
45static int tok_nextc(struct tok_state *tok);
46static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000047
Brett Cannond5ec98c2007-10-20 02:54:14 +000048
/* Prefix that introduces a type comment.  While tokenizing, each space in
   this constant is treated as "zero or more spaces or tabs". */
static const char *type_comment_prefix = "# type: ";
52
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000053/* Create and initialize a new tok_state structure */
54
55static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000056tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000057{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000058 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59 sizeof(struct tok_state));
60 if (tok == NULL)
61 return NULL;
62 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
63 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040069
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->read_coding_spec = 0;
79 tok->enc = NULL;
80 tok->encoding = NULL;
81 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020082 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 tok->decoding_readline = NULL;
84 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080085 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030086
Guido van Rossum495da292019-03-07 12:38:08 -080087 tok->async_hacks = 0;
88 tok->async_def = 0;
89 tok->async_def_indent = 0;
90 tok->async_def_nl = 0;
91
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000092 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000093}
94
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000095static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070096new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000097{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070099 if (!result) {
100 tok->done = E_NOMEM;
101 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000102 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700103 memcpy(result, s, len);
104 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000105 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000106}
107
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000108static char *
109error_ret(struct tok_state *tok) /* XXX */
110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000111 tok->decoding_erred = 1;
112 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
113 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200114 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
115 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000116 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000117}
118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119
/* Normalize the common spellings of the UTF-8 and Latin-1 encoding names.

   At most the first 12 characters of S are examined, after lower-casing
   them and replacing '_' with '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by a "-suffix"); otherwise returns S itself, unchanged.  Callers
   can therefore detect "no normalization happened" by pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions are only defined for values representable as
           unsigned char (or EOF), so convert before calling tolower(). */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
149/* Return the coding spec in S, or NULL if none is found. */
150
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700151static int
152get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000153{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700155 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
166 if (strncmp(t, "coding", 6) == 0) {
167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
173 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700181 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200182 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 if (!r)
184 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700185 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (r != q) {
187 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700192 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200193 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 }
196 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198}
199
200/* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000206check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000208{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000211
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200212 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200214 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200216 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700217 if (!get_coding_spec(line, &cs, size, tok))
218 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200219 if (!cs) {
220 Py_ssize_t i;
221 for (i = 0; i < size; i++) {
222 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223 break;
224 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225 /* Stop checking coding spec after a line containing
226 * anything except a comment. */
227 tok->read_coding_spec = 1;
228 break;
229 }
230 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700231 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200232 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 tok->read_coding_spec = 1;
234 if (tok->encoding == NULL) {
235 assert(tok->decoding_state == STATE_RAW);
236 if (strcmp(cs, "utf-8") == 0) {
237 tok->encoding = cs;
238 } else {
239 r = set_readline(tok, cs);
240 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700242 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700244 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300245 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 "encoding problem: %s", cs);
247 PyMem_FREE(cs);
248 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 } else { /* then, compare cs with BOM */
251 r = (strcmp(tok->encoding, cs) == 0);
252 if (!r)
253 PyErr_Format(PyExc_SyntaxError,
254 "encoding problem: %s with BOM", cs);
255 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258}
259
260/* See whether the file starts with a BOM. If it does,
261 invoke the set_readline function with the new encoding.
262 Return 1 on success, 0 on failure. */
263
264static int
265check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 void unget_char(int, struct tok_state *),
267 int set_readline(struct tok_state *, const char *),
268 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 int ch1, ch2, ch3;
271 ch1 = get_char(tok);
272 tok->decoding_state = STATE_RAW;
273 if (ch1 == EOF) {
274 return 1;
275 } else if (ch1 == 0xEF) {
276 ch2 = get_char(tok);
277 if (ch2 != 0xBB) {
278 unget_char(ch2, tok);
279 unget_char(ch1, tok);
280 return 1;
281 }
282 ch3 = get_char(tok);
283 if (ch3 != 0xBF) {
284 unget_char(ch3, tok);
285 unget_char(ch2, tok);
286 unget_char(ch1, tok);
287 return 1;
288 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 /* Disable support for UTF-16 BOMs until a decision
291 is made whether this needs to be supported. */
292 } else if (ch1 == 0xFE) {
293 ch2 = get_char(tok);
294 if (ch2 != 0xFF) {
295 unget_char(ch2, tok);
296 unget_char(ch1, tok);
297 return 1;
298 }
299 if (!set_readline(tok, "utf-16-be"))
300 return 0;
301 tok->decoding_state = STATE_NORMAL;
302 } else if (ch1 == 0xFF) {
303 ch2 = get_char(tok);
304 if (ch2 != 0xFE) {
305 unget_char(ch2, tok);
306 unget_char(ch1, tok);
307 return 1;
308 }
309 if (!set_readline(tok, "utf-16-le"))
310 return 0;
311 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000312#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000313 } else {
314 unget_char(ch1, tok);
315 return 1;
316 }
317 if (tok->encoding != NULL)
318 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 tok->encoding = new_string("utf-8", 5, tok);
320 if (!tok->encoding)
321 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 /* No need to set_readline: input is already utf-8 */
323 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000324}
325
326/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000327 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000328
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000329 On entry, tok->decoding_buffer will be one of:
330 1) NULL: need to call tok->decoding_readline to get a new line
331 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000333 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 (in the s buffer) to copy entire contents of the line read
335 by tok->decoding_readline. tok->decoding_buffer has the overflow.
336 In this case, fp_readl is called in a loop (with an expanded buffer)
337 until the buffer ends with a '\n' (or until the end of the file is
338 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000339*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340
341static char *
342fp_readl(char *s, int size, struct tok_state *tok)
343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 PyObject* bufobj;
345 const char *buf;
346 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 /* Ask for one less byte so we can terminate it */
349 assert(size > 0);
350 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 if (tok->decoding_buffer) {
353 bufobj = tok->decoding_buffer;
354 Py_INCREF(bufobj);
355 }
356 else
357 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100358 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 if (bufobj == NULL)
360 goto error;
361 }
362 if (PyUnicode_CheckExact(bufobj))
363 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200364 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 if (buf == NULL) {
366 goto error;
367 }
368 }
369 else
370 {
371 buf = PyByteArray_AsString(bufobj);
372 if (buf == NULL) {
373 goto error;
374 }
375 buflen = PyByteArray_GET_SIZE(bufobj);
376 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 Py_XDECREF(tok->decoding_buffer);
379 if (buflen > size) {
380 /* Too many chars, the rest goes into tok->decoding_buffer */
381 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382 buflen-size);
383 if (tok->decoding_buffer == NULL)
384 goto error;
385 buflen = size;
386 }
387 else
388 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 memcpy(s, buf, buflen);
391 s[buflen] = '\0';
392 if (buflen == 0) /* EOF */
393 s = NULL;
394 Py_DECREF(bufobj);
395 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000396
397error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 Py_XDECREF(bufobj);
399 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400}
401
402/* Set the readline function for TOK to a StreamReader's
403 readline function. The StreamReader is named ENC.
404
405 This function is called from check_bom and check_coding_spec.
406
407 ENC is usually identical to the future value of tok->encoding,
408 except for the (currently unsupported) case of UTF-16.
409
410 Return 1 on success, 0 on failure. */
411
412static int
413fp_setreadl(struct tok_state *tok, const char* enc)
414{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700415 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200416 _Py_IDENTIFIER(open);
417 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000418 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200419 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000420
Victor Stinner22a351a2010-10-14 12:04:34 +0000421 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200422 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100423 * position of tok->fp. If tok->fp was opened in text mode on Windows,
424 * its file position counts CRLF as one char and can't be directly mapped
425 * to the file offset for fd. Instead we step back one byte and read to
426 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200427 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100428 if (pos == -1 ||
429 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000430 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700431 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000432 }
433
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700434 io = PyImport_ImportModuleNoBlock("io");
435 if (io == NULL)
436 return 0;
437
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200438 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000439 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700440 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700442 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200444 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700445 Py_DECREF(stream);
446 if (readline == NULL)
447 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300448 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700449
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100450 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100451 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700452 if (bufobj == NULL)
453 return 0;
454 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100455 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700457 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458}
459
460/* Fetch the next byte from TOK. */
461
462static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464}
465
466/* Unfetch the last byte back into TOK. */
467
468static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000470}
471
/* Check whether the bytes at S start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes, 0 if not.

   Only the lead byte and the 10xxxxxx form of the continuation bytes are
   checked; overlong encodings and out-of-range code points are not rejected
   here.  S must be NUL-terminated. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Walk the continuation bytes front to back.  A sequence cut short by
       the terminating NUL is rejected at the NUL itself (0 < 0x80), so no
       byte beyond the end of the string is ever read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
499
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000500/* Read a line of input from TOK. Determine encoding
501 if necessary. */
502
503static char *
504decoding_fgets(char *s, int size, struct tok_state *tok)
505{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 char *line = NULL;
507 int badchar = 0;
508 for (;;) {
509 if (tok->decoding_state == STATE_NORMAL) {
510 /* We already have a codec associated with
511 this input. */
512 line = fp_readl(s, size, tok);
513 break;
514 } else if (tok->decoding_state == STATE_RAW) {
515 /* We want a 'raw' read. */
516 line = Py_UniversalNewlineFgets(s, size,
517 tok->fp, NULL);
518 break;
519 } else {
520 /* We have not yet determined the encoding.
521 If an encoding is found, use the file-pointer
522 reader functions from now on. */
523 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524 return error_ret(tok);
525 assert(tok->decoding_state != STATE_INIT);
526 }
527 }
528 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530 return error_ret(tok);
531 }
532 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000533 /* The default encoding is UTF-8, so make sure we don't have any
534 non-UTF-8 sequences in it. */
535 if (line && !tok->encoding) {
536 unsigned char *c;
537 int length;
538 for (c = (unsigned char *)line; *c; c += length)
539 if (!(length = valid_utf8(c))) {
540 badchar = *c;
541 break;
542 }
543 }
544 if (badchar) {
545 /* Need to add 1 to the line number, since this line
546 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200547 PyErr_Format(PyExc_SyntaxError,
548 "Non-UTF-8 code starting with '\\x%.2x' "
549 "in file %U on line %i, "
550 "but no encoding declared; "
551 "see http://python.org/dev/peps/pep-0263/ for details",
552 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000553 return error_ret(tok);
554 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000555 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556}
557
558static int
559decoding_feof(struct tok_state *tok)
560{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 if (tok->decoding_state != STATE_NORMAL) {
562 return feof(tok->fp);
563 } else {
564 PyObject* buf = tok->decoding_buffer;
565 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100566 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000567 if (buf == NULL) {
568 error_ret(tok);
569 return 1;
570 } else {
571 tok->decoding_buffer = buf;
572 }
573 }
574 return PyObject_Length(buf) == 0;
575 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576}
577
578/* Fetch a byte from TOK, using the string buffer. */
579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000580static int
581buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000582 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000583}
584
585/* Unfetch a byte from TOK, using the string buffer. */
586
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000587static void
588buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000589 tok->str--;
590 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
593/* Set the readline function for TOK to ENC. For the string-based
594 tokenizer, this means to just record the encoding. */
595
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000596static int
597buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 tok->enc = enc;
599 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600}
601
602/* Return a UTF-8 encoding Python string object from the
603 C byte string STR, which is encoded with ENC. */
604
605static PyObject *
606translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 PyObject *utf8;
608 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609 if (buf == NULL)
610 return NULL;
611 utf8 = PyUnicode_AsUTF8String(buf);
612 Py_DECREF(buf);
613 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614}
615
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000616
617static char *
618translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200619 int skip_next_lf = 0;
620 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 char *buf, *current;
622 char c = '\0';
623 buf = PyMem_MALLOC(needed_length);
624 if (buf == NULL) {
625 tok->done = E_NOMEM;
626 return NULL;
627 }
628 for (current = buf; *s; s++, current++) {
629 c = *s;
630 if (skip_next_lf) {
631 skip_next_lf = 0;
632 if (c == '\n') {
633 c = *++s;
634 if (!c)
635 break;
636 }
637 }
638 if (c == '\r') {
639 skip_next_lf = 1;
640 c = '\n';
641 }
642 *current = c;
643 }
644 /* If this is exec input, add a newline to the end of the string if
645 there isn't one already. */
646 if (exec_input && c != '\n') {
647 *current = '\n';
648 current++;
649 }
650 *current = '\0';
651 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000652 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 /* should never fail */
Pablo Galindocb90c892019-03-19 17:17:58 +0000654 char* result = PyMem_REALLOC(buf, final_length);
655 if (result == NULL) {
656 PyMem_FREE(buf);
657 }
658 buf = result;
659 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000660 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000661}
662
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663/* Decode a byte string STR for use as the buffer of TOK.
664 Look for encoding declarations inside STR, and record them
665 inside TOK. */
666
667static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000668decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject* utf8 = NULL;
671 const char *str;
672 const char *s;
673 const char *newl[2] = {NULL, NULL};
674 int lineno = 0;
675 tok->input = str = translate_newlines(input, single, tok);
676 if (str == NULL)
677 return NULL;
678 tok->enc = NULL;
679 tok->str = str;
680 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
681 return error_ret(tok);
682 str = tok->str; /* string after BOM if any */
683 assert(str);
684 if (tok->enc != NULL) {
685 utf8 = translate_into_utf8(str, tok->enc);
686 if (utf8 == NULL)
687 return error_ret(tok);
688 str = PyBytes_AsString(utf8);
689 }
690 for (s = str;; s++) {
691 if (*s == '\0') break;
692 else if (*s == '\n') {
693 assert(lineno < 2);
694 newl[lineno] = s;
695 lineno++;
696 if (lineno == 2) break;
697 }
698 }
699 tok->enc = NULL;
700 /* need to check line 1 and 2 separately since check_coding_spec
701 assumes a single line as input */
702 if (newl[0]) {
703 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200705 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707 tok, buf_setreadl))
708 return error_ret(tok);
709 }
710 }
711 if (tok->enc != NULL) {
712 assert(utf8 == NULL);
713 utf8 = translate_into_utf8(str, tok->enc);
714 if (utf8 == NULL)
715 return error_ret(tok);
716 str = PyBytes_AS_STRING(utf8);
717 }
718 assert(tok->decoding_buffer == NULL);
719 tok->decoding_buffer = utf8; /* CAUTION */
720 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000721}
722
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723/* Set up tokenizer for string */
724
725struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000726PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000727{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 struct tok_state *tok = tok_new();
729 if (tok == NULL)
730 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300731 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (str == NULL) {
733 PyTokenizer_Free(tok);
734 return NULL;
735 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000736
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 /* XXX: constify members. */
738 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
739 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740}
741
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000742struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000743PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000744{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 struct tok_state *tok = tok_new();
746 if (tok == NULL)
747 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 tok->input = str = translate_newlines(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 if (str == NULL) {
750 PyTokenizer_Free(tok);
751 return NULL;
752 }
753 tok->decoding_state = STATE_RAW;
754 tok->read_coding_spec = 1;
755 tok->enc = NULL;
756 tok->str = str;
757 tok->encoding = (char *)PyMem_MALLOC(6);
758 if (!tok->encoding) {
759 PyTokenizer_Free(tok);
760 return NULL;
761 }
762 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000763
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 /* XXX: constify members. */
765 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
766 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000767}
768
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000769/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770
771struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300772PyTokenizer_FromFile(FILE *fp, const char* enc,
773 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 struct tok_state *tok = tok_new();
776 if (tok == NULL)
777 return NULL;
778 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
779 PyTokenizer_Free(tok);
780 return NULL;
781 }
782 tok->cur = tok->inp = tok->buf;
783 tok->end = tok->buf + BUFSIZ;
784 tok->fp = fp;
785 tok->prompt = ps1;
786 tok->nextprompt = ps2;
787 if (enc != NULL) {
788 /* Must copy encoding declaration since it
789 gets copied into the parse tree. */
790 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
791 if (!tok->encoding) {
792 PyTokenizer_Free(tok);
793 return NULL;
794 }
795 strcpy(tok->encoding, enc);
796 tok->decoding_state = STATE_NORMAL;
797 }
798 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799}
800
801
802/* Free a tok_state structure */
803
804void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000805PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 if (tok->encoding != NULL)
808 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 Py_XDECREF(tok->decoding_readline);
810 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200811 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 if (tok->fp != NULL && tok->buf != NULL)
813 PyMem_FREE(tok->buf);
814 if (tok->input)
815 PyMem_FREE((char *)tok->input);
816 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817}
818
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000819/* Get next char, updating state; error code goes into tok->done */
/* Returns the next input byte (passed through Py_CHARMASK) or EOF.
   While tok->cur has not caught up with tok->inp the byte is served
   straight from the buffer.  Otherwise one more line is fetched from one
   of three sources:
     - tok->fp == NULL:      the in-memory string; tok->inp is advanced to
                             the end of the next line;
     - tok->prompt != NULL:  PyOS_Readline(), with optional recoding to UTF-8;
     - otherwise:            decoding_fgets(), growing the buffer by BUFSIZ
                             until the line is complete.
   Every fetched line increments tok->lineno.  When the buffer is
   reallocated, tok->cur, tok->start, tok->line_start and
   tok->multi_line_start are recomputed from saved offsets.  Once tok->done
   is no longer E_OK the function keeps returning EOF. */
820
821static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200822tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 for (;;) {
825 if (tok->cur != tok->inp) {
826 return Py_CHARMASK(*tok->cur++); /* Fast path */
827 }
828 if (tok->done != E_OK)
829 return EOF;
 /* --- Source 1: tokenizing an in-memory string --- */
830 if (tok->fp == NULL) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL)
833 end++;
834 else {
835 end = strchr(tok->inp, '\0');
 /* No newline and no bytes before the terminating NUL: input exhausted. */
836 if (end == tok->inp) {
837 tok->done = E_EOF;
838 return EOF;
839 }
840 }
 /* With no token in progress the buffer start is moved up to this line. */
841 if (tok->start == NULL)
842 tok->buf = tok->cur;
843 tok->line_start = tok->cur;
844 tok->lineno++;
845 tok->inp = end;
846 return Py_CHARMASK(*tok->cur++);
847 }
 /* --- Source 2: interactive input, one line per PyOS_Readline() call --- */
848 if (tok->prompt != NULL) {
849 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000850 if (newtok != NULL) {
851 char *translated = translate_newlines(newtok, 0, tok);
 /* The raw line is released whether or not translation succeeded. */
852 PyMem_FREE(newtok);
853 if (translated == NULL)
854 return EOF;
855 newtok = translated;
856 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 if (tok->encoding && newtok && *newtok) {
858 /* Recode to UTF-8 */
859 Py_ssize_t buflen;
860 const char* buf;
861 PyObject *u = translate_into_utf8(newtok, tok->encoding);
862 PyMem_FREE(newtok);
863 if (!u) {
864 tok->done = E_DECODE;
865 return EOF;
866 }
867 buflen = PyBytes_GET_SIZE(u);
868 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700870 if (newtok == NULL) {
871 Py_DECREF(u);
872 tok->done = E_NOMEM;
873 return EOF;
874 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000875 strcpy(newtok, buf);
876 Py_DECREF(u);
877 }
 /* Subsequent reads use the continuation prompt, if one was supplied. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000878 if (tok->nextprompt != NULL)
879 tok->prompt = tok->nextprompt;
 /* NULL line: reported as E_INTR.  Empty line: reported as E_EOF. */
880 if (newtok == NULL)
881 tok->done = E_INTR;
882 else if (*newtok == '\0') {
883 PyMem_FREE(newtok);
884 tok->done = E_EOF;
885 }
 /* A token is in progress: append the new line to the existing buffer.
    Offsets are saved first because the realloc may move the buffer. */
886 else if (tok->start != NULL) {
887 size_t start = tok->start - tok->buf;
888 size_t oldlen = tok->cur - tok->buf;
889 size_t newlen = oldlen + strlen(newtok);
Miss Islington (bot)b2e281a2020-01-06 08:26:13 -0800890 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000891 char *buf = tok->buf;
892 buf = (char *)PyMem_REALLOC(buf, newlen+1);
893 tok->lineno++;
894 if (buf == NULL) {
895 PyMem_FREE(tok->buf);
896 tok->buf = NULL;
897 PyMem_FREE(newtok);
898 tok->done = E_NOMEM;
899 return EOF;
900 }
901 tok->buf = buf;
902 tok->cur = tok->buf + oldlen;
Miss Islington (bot)b2e281a2020-01-06 08:26:13 -0800903 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 tok->line_start = tok->cur;
905 strcpy(tok->buf + oldlen, newtok);
906 PyMem_FREE(newtok);
907 tok->inp = tok->buf + newlen;
908 tok->end = tok->inp + 1;
909 tok->start = tok->buf + start;
910 }
 /* No token in progress: the new line replaces the old buffer outright. */
911 else {
912 tok->lineno++;
913 if (tok->buf != NULL)
914 PyMem_FREE(tok->buf);
915 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916 tok->cur = tok->buf;
917 tok->line_start = tok->buf;
918 tok->inp = strchr(tok->buf, '\0');
919 tok->end = tok->inp + 1;
920 }
921 }
 /* --- Source 3: non-interactive file input via decoding_fgets() --- */
922 else {
923 int done = 0;
924 Py_ssize_t cur = 0;
925 char *pt;
 /* No token in progress: read the next line into the start of the buffer. */
926 if (tok->start == NULL) {
927 if (tok->buf == NULL) {
928 tok->buf = (char *)
929 PyMem_MALLOC(BUFSIZ);
930 if (tok->buf == NULL) {
931 tok->done = E_NOMEM;
932 return EOF;
933 }
934 tok->end = tok->buf + BUFSIZ;
935 }
936 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
937 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200938 if (!tok->decoding_erred)
939 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000940 done = 1;
941 }
942 else {
943 tok->done = E_OK;
944 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700945 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 }
947 }
 /* Token in progress: keep the buffered bytes, remember the read offset,
    and let the loop below append the next line after tok->inp. */
948 else {
949 cur = tok->cur - tok->buf;
950 if (decoding_feof(tok)) {
951 tok->done = E_EOF;
952 done = 1;
953 }
954 else
955 tok->done = E_OK;
956 }
957 tok->lineno++;
958 /* Read until '\n' or EOF */
959 while (!done) {
 /* Save positions as offsets, grow the buffer by BUFSIZ, then rebase. */
960 Py_ssize_t curstart = tok->start == NULL ? -1 :
961 tok->start - tok->buf;
Miss Islington (bot)cf52bd02019-07-29 07:18:47 -0700962 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 Py_ssize_t curvalid = tok->inp - tok->buf;
964 Py_ssize_t newsize = curvalid + BUFSIZ;
965 char *newbuf = tok->buf;
966 newbuf = (char *)PyMem_REALLOC(newbuf,
967 newsize);
968 if (newbuf == NULL) {
969 tok->done = E_NOMEM;
970 tok->cur = tok->inp;
971 return EOF;
972 }
973 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200974 tok->cur = tok->buf + cur;
Miss Islington (bot)cf52bd02019-07-29 07:18:47 -0700975 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200976 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 tok->inp = tok->buf + curvalid;
978 tok->end = tok->buf + newsize;
979 tok->start = curstart < 0 ? NULL :
980 tok->buf + curstart;
981 if (decoding_fgets(tok->inp,
982 (int)(tok->end - tok->inp),
983 tok) == NULL) {
984 /* Break out early on decoding
985 errors, as tok->buf will be NULL
986 */
987 if (tok->decoding_erred)
988 return EOF;
989 /* Last line does not end in \n,
990 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -0700991 if (tok->inp[-1] != '\n')
992 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000993 }
994 tok->inp = strchr(tok->inp, '\0');
995 done = tok->inp[-1] == '\n';
996 }
997 if (tok->buf != NULL) {
998 tok->cur = tok->buf + cur;
999 tok->line_start = tok->cur;
1000 /* replace "\r\n" with "\n" */
1001 /* For Mac leave the \r, giving a syntax error */
1002 pt = tok->inp - 2;
1003 if (pt >= tok->buf && *pt == '\r') {
1004 *pt++ = '\n';
1005 *pt = '\0';
1006 tok->inp = pt;
1007 }
1008 }
1009 }
 /* Any failure recorded above ends the read: mark the buffer as fully
    consumed so the fast path cannot serve stale bytes. */
1010 if (tok->done != E_OK) {
1011 if (tok->prompt != NULL)
1012 PySys_WriteStderr("\n");
1013 tok->cur = tok->inp;
1014 return EOF;
1015 }
1016 }
1017 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001018}
1019
1020
1021/* Back-up one character */
1022
1023static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001024tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001026 if (c != EOF) {
1027 if (--tok->cur < tok->buf)
1028 Py_FatalError("tok_backup: beginning of buffer");
1029 if (*tok->cur != c)
1030 *tok->cur = c;
1031 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032}
1033
1034
Guido van Rossum926f13a1998-04-09 21:38:06 +00001035static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001036syntaxerror(struct tok_state *tok, const char *format, ...)
1037{
Miss Islington (bot)efd878c2020-02-12 02:35:10 -08001038 PyObject *errmsg, *errtext, *args;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001039 va_list vargs;
1040#ifdef HAVE_STDARG_PROTOTYPES
1041 va_start(vargs, format);
1042#else
1043 va_start(vargs);
1044#endif
Miss Islington (bot)efd878c2020-02-12 02:35:10 -08001045 errmsg = PyUnicode_FromFormatV(format, vargs);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001046 va_end(vargs);
Miss Islington (bot)efd878c2020-02-12 02:35:10 -08001047 if (!errmsg) {
1048 goto error;
1049 }
1050
1051 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1052 "replace");
1053 if (!errtext) {
1054 goto error;
1055 }
1056 int offset = (int)PyUnicode_GET_LENGTH(errtext);
1057 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1058 if (line_len != tok->cur - tok->line_start) {
1059 Py_DECREF(errtext);
1060 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1061 "replace");
1062 }
1063 if (!errtext) {
1064 goto error;
1065 }
1066
1067 args = Py_BuildValue("(O(OiiN))", errmsg,
1068 tok->filename, tok->lineno, offset, errtext);
1069 if (args) {
1070 PyErr_SetObject(PyExc_SyntaxError, args);
1071 Py_DECREF(args);
1072 }
1073
1074error:
1075 Py_XDECREF(errmsg);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001076 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001077 return ERRORTOKEN;
1078}
1079
1080static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001081indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001082{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001083 tok->done = E_TABSPACE;
1084 tok->cur = tok->inp;
1085 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001086}
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088/* Verify that the identifier follows PEP 3131.
1089 All identifier strings are guaranteed to be "ready" unicode objects.
1090 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001091static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001092verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001093{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 PyObject *s;
1095 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001096 if (tok->decoding_erred)
1097 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001099 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1101 PyErr_Clear();
1102 tok->done = E_IDENTIFIER;
1103 } else {
1104 tok->done = E_ERROR;
1105 }
1106 return 0;
1107 }
1108 result = PyUnicode_IsIdentifier(s);
1109 Py_DECREF(s);
1110 if (result == 0)
1111 tok->done = E_IDENTIFIER;
1112 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001113}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001114
/* Consume the remainder of a decimal digit run in which groups of digits
   may be separated by single underscores.

   Returns the first character that is neither a digit nor a separating
   underscore; that character has been read and is NOT pushed back.  If an
   underscore is not followed by a digit, the offending character is pushed
   back, "invalid decimal literal" is raised via syntaxerror(), and 0 is
   returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    for (;;) {
        /* Swallow one group of digits. */
        c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1136
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137/* Get next token, after space stripping etc. */
1138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001139static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001140tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001142 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001144
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 tok->start = NULL;
1148 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001150 /* Get indentation level */
1151 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001152 int col = 0;
1153 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001154 tok->atbol = 0;
1155 for (;;) {
1156 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001157 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001158 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001159 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001161 col = (col / tok->tabsize + 1) * tok->tabsize;
1162 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 }
Brett Cannona721aba2016-09-09 14:57:09 -07001164 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001166 }
1167 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001169 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 }
1171 tok_backup(tok, c);
1172 if (c == '#' || c == '\n') {
1173 /* Lines with only whitespace and/or comments
1174 shouldn't affect the indentation and are
1175 not passed to the parser as NEWLINE tokens,
1176 except *totally* empty lines in interactive
1177 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001178 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001180 }
Miss Islington (bot)184a3812019-12-08 20:56:19 -08001181 else if (tok->prompt != NULL && tok->lineno == 1) {
1182 /* In interactive mode, if the first line contains
1183 only spaces and/or a comment, let it through. */
1184 blankline = 0;
1185 col = altcol = 0;
1186 }
Brett Cannona721aba2016-09-09 14:57:09 -07001187 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001188 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001189 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 /* We can't jump back right here since we still
1191 may need to skip to the end of a comment */
1192 }
1193 if (!blankline && tok->level == 0) {
1194 if (col == tok->indstack[tok->indent]) {
1195 /* No change */
1196 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001197 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 }
1199 }
1200 else if (col > tok->indstack[tok->indent]) {
1201 /* Indent -- always one */
1202 if (tok->indent+1 >= MAXINDENT) {
1203 tok->done = E_TOODEEP;
1204 tok->cur = tok->inp;
1205 return ERRORTOKEN;
1206 }
1207 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001208 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001209 }
1210 tok->pendin++;
1211 tok->indstack[++tok->indent] = col;
1212 tok->altindstack[tok->indent] = altcol;
1213 }
1214 else /* col < tok->indstack[tok->indent] */ {
1215 /* Dedent -- any number, must be consistent */
1216 while (tok->indent > 0 &&
1217 col < tok->indstack[tok->indent]) {
1218 tok->pendin--;
1219 tok->indent--;
1220 }
1221 if (col != tok->indstack[tok->indent]) {
1222 tok->done = E_DEDENT;
1223 tok->cur = tok->inp;
1224 return ERRORTOKEN;
1225 }
1226 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001227 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 }
1229 }
1230 }
1231 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001232
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001234
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001235 /* Return pending indents/dedents */
1236 if (tok->pendin != 0) {
1237 if (tok->pendin < 0) {
1238 tok->pendin++;
1239 return DEDENT;
1240 }
1241 else {
1242 tok->pendin--;
1243 return INDENT;
1244 }
1245 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001246
Guido van Rossum495da292019-03-07 12:38:08 -08001247 /* Peek ahead at the next character */
1248 c = tok_nextc(tok);
1249 tok_backup(tok, c);
1250 /* Check if we are closing an async function */
1251 if (tok->async_def
1252 && !blankline
1253 /* Due to some implementation artifacts of type comments,
1254 * a TYPE_COMMENT at the start of a function won't set an
1255 * indentation level and it will produce a NEWLINE after it.
1256 * To avoid spuriously ending an async function due to this,
1257 * wait until we have some non-newline char in front of us. */
1258 && c != '\n'
1259 && tok->level == 0
1260 /* There was a NEWLINE after ASYNC DEF,
1261 so we're past the signature. */
1262 && tok->async_def_nl
1263 /* Current indentation level is less than where
1264 the async function was defined */
1265 && tok->async_def_indent >= tok->indent)
1266 {
1267 tok->async_def = 0;
1268 tok->async_def_indent = 0;
1269 tok->async_def_nl = 0;
1270 }
1271
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 tok->start = NULL;
1274 /* Skip spaces */
1275 do {
1276 c = tok_nextc(tok);
1277 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001278
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 /* Set start of current token */
1280 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001281
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001282 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001283 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001284 const char *prefix, *p, *type_start;
1285
Brett Cannona721aba2016-09-09 14:57:09 -07001286 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001288 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001289
1290 if (tok->type_comments) {
1291 p = tok->start;
1292 prefix = type_comment_prefix;
1293 while (*prefix && p < tok->cur) {
1294 if (*prefix == ' ') {
1295 while (*p == ' ' || *p == '\t') {
1296 p++;
1297 }
1298 } else if (*prefix == *p) {
1299 p++;
1300 } else {
1301 break;
1302 }
1303
1304 prefix++;
1305 }
1306
1307 /* This is a type comment if we matched all of type_comment_prefix. */
1308 if (!*prefix) {
1309 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001310 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001311 tok_backup(tok, c); /* don't eat the newline or EOF */
1312
1313 type_start = p;
1314
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001315 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001316 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001317 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001318 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001319 && !(tok->cur > ignore_end
1320 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001321
1322 if (is_type_ignore) {
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001323 *p_start = (char *) ignore_end;
1324 *p_end = tok->cur;
1325
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001326 /* If this type ignore is the only thing on the line, consume the newline also. */
1327 if (blankline) {
1328 tok_nextc(tok);
1329 tok->atbol = 1;
1330 }
1331 return TYPE_IGNORE;
1332 } else {
1333 *p_start = (char *) type_start; /* after type_comment_prefix */
1334 *p_end = tok->cur;
1335 return TYPE_COMMENT;
1336 }
1337 }
1338 }
Brett Cannona721aba2016-09-09 14:57:09 -07001339 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001340
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 /* Check for EOF and errors now */
1342 if (c == EOF) {
1343 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1344 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001345
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 /* Identifier (most frequent token!) */
1347 nonascii = 0;
1348 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001349 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001350 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001351 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001352 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001353 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001354 /* Since this is a backwards compatibility support literal we don't
1355 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001356 else if (!(saw_b || saw_u || saw_r || saw_f)
1357 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001358 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001359 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001360 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001361 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001362 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001363 }
1364 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001365 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001366 }
1367 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001368 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001369 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001371 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001373 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 }
1375 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001376 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001378 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 c = tok_nextc(tok);
1380 }
1381 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001382 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001384 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 *p_start = tok->start;
1386 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001387
Guido van Rossum495da292019-03-07 12:38:08 -08001388 /* async/await parsing block. */
1389 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1390 /* May be an 'async' or 'await' token. For Python 3.7 or
1391 later we recognize them unconditionally. For Python
1392 3.5 or 3.6 we recognize 'async' in front of 'def', and
1393 either one inside of 'async def'. (Technically we
1394 shouldn't recognize these at all for 3.4 or earlier,
1395 but there's no *valid* Python 3.4 code that would be
1396 rejected, and async functions will be rejected in a
1397 later phase.) */
1398 if (!tok->async_hacks || tok->async_def) {
1399 /* Always recognize the keywords. */
1400 if (memcmp(tok->start, "async", 5) == 0) {
1401 return ASYNC;
1402 }
1403 if (memcmp(tok->start, "await", 5) == 0) {
1404 return AWAIT;
1405 }
1406 }
1407 else if (memcmp(tok->start, "async", 5) == 0) {
1408 /* The current token is 'async'.
1409 Look ahead one token to see if that is 'def'. */
1410
1411 struct tok_state ahead_tok;
1412 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1413 int ahead_tok_kind;
1414
1415 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1416 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1417 &ahead_tok_end);
1418
1419 if (ahead_tok_kind == NAME
1420 && ahead_tok.cur - ahead_tok.start == 3
1421 && memcmp(ahead_tok.start, "def", 3) == 0)
1422 {
1423 /* The next token is going to be 'def', so instead of
1424 returning a plain NAME token, return ASYNC. */
1425 tok->async_def_indent = tok->indent;
1426 tok->async_def = 1;
1427 return ASYNC;
1428 }
1429 }
1430 }
1431
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 return NAME;
1433 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 /* Newline */
1436 if (c == '\n') {
1437 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001438 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001440 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 *p_start = tok->start;
1442 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1443 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001444 if (tok->async_def) {
1445 /* We're somewhere inside an 'async def' function, and
1446 we've encountered a NEWLINE after its signature. */
1447 tok->async_def_nl = 1;
1448 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 return NEWLINE;
1450 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 /* Period or number starting with period? */
1453 if (c == '.') {
1454 c = tok_nextc(tok);
1455 if (isdigit(c)) {
1456 goto fraction;
1457 } else if (c == '.') {
1458 c = tok_nextc(tok);
1459 if (c == '.') {
1460 *p_start = tok->start;
1461 *p_end = tok->cur;
1462 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001463 }
1464 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 tok_backup(tok, c);
1466 }
1467 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001468 }
1469 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001470 tok_backup(tok, c);
1471 }
1472 *p_start = tok->start;
1473 *p_end = tok->cur;
1474 return DOT;
1475 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001476
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 /* Number */
1478 if (isdigit(c)) {
1479 if (c == '0') {
1480 /* Hex, octal or binary -- maybe. */
1481 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 /* Hex */
1484 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001486 if (c == '_') {
1487 c = tok_nextc(tok);
1488 }
1489 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001490 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001491 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001492 }
1493 do {
1494 c = tok_nextc(tok);
1495 } while (isxdigit(c));
1496 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 }
1498 else if (c == 'o' || c == 'O') {
1499 /* Octal */
1500 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001502 if (c == '_') {
1503 c = tok_nextc(tok);
1504 }
1505 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001506 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001507 if (isdigit(c)) {
1508 return syntaxerror(tok,
1509 "invalid digit '%c' in octal literal", c);
1510 }
1511 else {
1512 return syntaxerror(tok, "invalid octal literal");
1513 }
Brett Cannona721aba2016-09-09 14:57:09 -07001514 }
1515 do {
1516 c = tok_nextc(tok);
1517 } while ('0' <= c && c < '8');
1518 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001519 if (isdigit(c)) {
1520 return syntaxerror(tok,
1521 "invalid digit '%c' in octal literal", c);
1522 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001523 }
1524 else if (c == 'b' || c == 'B') {
1525 /* Binary */
1526 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001528 if (c == '_') {
1529 c = tok_nextc(tok);
1530 }
1531 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001532 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001533 if (isdigit(c)) {
1534 return syntaxerror(tok,
1535 "invalid digit '%c' in binary literal", c);
1536 }
1537 else {
1538 return syntaxerror(tok, "invalid binary literal");
1539 }
Brett Cannona721aba2016-09-09 14:57:09 -07001540 }
1541 do {
1542 c = tok_nextc(tok);
1543 } while (c == '0' || c == '1');
1544 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001545 if (isdigit(c)) {
1546 return syntaxerror(tok,
1547 "invalid digit '%c' in binary literal", c);
1548 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 }
1550 else {
1551 int nonzero = 0;
1552 /* maybe old-style octal; c is first char of it */
1553 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001554 while (1) {
1555 if (c == '_') {
1556 c = tok_nextc(tok);
1557 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001558 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001559 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001560 }
1561 }
1562 if (c != '0') {
1563 break;
1564 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001565 c = tok_nextc(tok);
1566 }
Brett Cannona721aba2016-09-09 14:57:09 -07001567 if (isdigit(c)) {
1568 nonzero = 1;
1569 c = tok_decimal_tail(tok);
1570 if (c == 0) {
1571 return ERRORTOKEN;
1572 }
1573 }
1574 if (c == '.') {
1575 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001576 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001577 }
1578 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001579 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001580 }
1581 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001582 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001583 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001585 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001586 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001587 return syntaxerror(tok,
1588 "leading zeros in decimal integer "
1589 "literals are not permitted; "
1590 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 }
1592 }
1593 }
1594 else {
1595 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001596 c = tok_decimal_tail(tok);
1597 if (c == 0) {
1598 return ERRORTOKEN;
1599 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 {
1601 /* Accept floating point numbers. */
1602 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001603 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 fraction:
1605 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001606 if (isdigit(c)) {
1607 c = tok_decimal_tail(tok);
1608 if (c == 0) {
1609 return ERRORTOKEN;
1610 }
1611 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 }
1613 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001614 int e;
1615 exponent:
1616 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 /* Exponent part */
1618 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001619 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001621 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001622 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001623 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001624 }
1625 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001627 tok_backup(tok, e);
1628 *p_start = tok->start;
1629 *p_end = tok->cur;
1630 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 }
Brett Cannona721aba2016-09-09 14:57:09 -07001632 c = tok_decimal_tail(tok);
1633 if (c == 0) {
1634 return ERRORTOKEN;
1635 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 }
Brett Cannona721aba2016-09-09 14:57:09 -07001637 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 /* Imaginary part */
1639 imaginary:
1640 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001641 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 }
1643 }
1644 tok_backup(tok, c);
1645 *p_start = tok->start;
1646 *p_end = tok->cur;
1647 return NUMBER;
1648 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001649
1650 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 /* String */
1652 if (c == '\'' || c == '"') {
1653 int quote = c;
1654 int quote_size = 1; /* 1 or 3 */
1655 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001656
Anthony Sottile995d9b92019-01-12 20:05:13 -08001657 /* Nodes of type STRING, especially multi line strings
1658 must be handled differently in order to get both
1659 the starting line number and the column offset right.
1660 (cf. issue 16806) */
1661 tok->first_lineno = tok->lineno;
1662 tok->multi_line_start = tok->line_start;
1663
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 /* Find the quote size and start of string */
1665 c = tok_nextc(tok);
1666 if (c == quote) {
1667 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001668 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001670 }
1671 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001672 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001673 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 }
Brett Cannona721aba2016-09-09 14:57:09 -07001675 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001677 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001678
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 /* Get rest of string */
1680 while (end_quote_size != quote_size) {
1681 c = tok_nextc(tok);
1682 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001683 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001685 }
1686 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001688 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 tok->cur = tok->inp;
1690 return ERRORTOKEN;
1691 }
1692 if (quote_size == 1 && c == '\n') {
1693 tok->done = E_EOLS;
1694 tok->cur = tok->inp;
1695 return ERRORTOKEN;
1696 }
Brett Cannona721aba2016-09-09 14:57:09 -07001697 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001699 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 else {
1701 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001702 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001703 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001704 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 }
1706 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001707
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 *p_start = tok->start;
1709 *p_end = tok->cur;
1710 return STRING;
1711 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001712
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713 /* Line continuation */
1714 if (c == '\\') {
1715 c = tok_nextc(tok);
1716 if (c != '\n') {
1717 tok->done = E_LINECONT;
1718 tok->cur = tok->inp;
1719 return ERRORTOKEN;
1720 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001721 c = tok_nextc(tok);
1722 if (c == EOF) {
1723 tok->done = E_EOF;
1724 tok->cur = tok->inp;
1725 return ERRORTOKEN;
1726 } else {
1727 tok_backup(tok, c);
1728 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 tok->cont_line = 1;
1730 goto again; /* Read next line */
1731 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001732
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 /* Check for two-character token */
1734 {
1735 int c2 = tok_nextc(tok);
1736 int token = PyToken_TwoChars(c, c2);
1737 if (token != OP) {
1738 int c3 = tok_nextc(tok);
1739 int token3 = PyToken_ThreeChars(c, c2, c3);
1740 if (token3 != OP) {
1741 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001742 }
1743 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 tok_backup(tok, c3);
1745 }
1746 *p_start = tok->start;
1747 *p_end = tok->cur;
1748 return token;
1749 }
1750 tok_backup(tok, c2);
1751 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001752
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 /* Keep track of parentheses nesting level */
1754 switch (c) {
1755 case '(':
1756 case '[':
1757 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001758 if (tok->level >= MAXLEVEL) {
1759 return syntaxerror(tok, "too many nested parentheses");
1760 }
1761 tok->parenstack[tok->level] = c;
1762 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001763 tok->level++;
1764 break;
1765 case ')':
1766 case ']':
1767 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001768 if (!tok->level) {
1769 return syntaxerror(tok, "unmatched '%c'", c);
1770 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001771 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001772 int opening = tok->parenstack[tok->level];
1773 if (!((opening == '(' && c == ')') ||
1774 (opening == '[' && c == ']') ||
1775 (opening == '{' && c == '}')))
1776 {
1777 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1778 return syntaxerror(tok,
1779 "closing parenthesis '%c' does not match "
1780 "opening parenthesis '%c' on line %d",
1781 c, opening, tok->parenlinenostack[tok->level]);
1782 }
1783 else {
1784 return syntaxerror(tok,
1785 "closing parenthesis '%c' does not match "
1786 "opening parenthesis '%c'",
1787 c, opening);
1788 }
1789 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001790 break;
1791 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001792
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001793 /* Punctuation character */
1794 *p_start = tok->start;
1795 *p_end = tok->cur;
1796 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001797}
1798
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001799int
1800PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1801{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 int result = tok_get(tok, p_start, p_end);
1803 if (tok->decoding_erred) {
1804 result = ERRORTOKEN;
1805 tok->done = E_DECODE;
1806 }
1807 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001808}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001809
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001810/* Get the encoding of a Python file. Check for the coding cookie and check if
1811 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001812
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001813 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1814 encoding in the first or second line of the file (in which case the encoding
1815 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001816
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001817 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1818 by the caller. */
1819
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001820char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001821PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001822{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001823 struct tok_state *tok;
1824 FILE *fp;
1825 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001826
Victor Stinnerdaf45552013-08-28 00:53:59 +02001827 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 if (fd < 0) {
1829 return NULL;
1830 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001831
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001832 fp = fdopen(fd, "r");
1833 if (fp == NULL) {
1834 return NULL;
1835 }
1836 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1837 if (tok == NULL) {
1838 fclose(fp);
1839 return NULL;
1840 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001841 if (filename != NULL) {
1842 Py_INCREF(filename);
1843 tok->filename = filename;
1844 }
1845 else {
1846 tok->filename = PyUnicode_FromString("<string>");
1847 if (tok->filename == NULL) {
1848 fclose(fp);
1849 PyTokenizer_Free(tok);
1850 return encoding;
1851 }
1852 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001853 while (tok->lineno < 2 && tok->done == E_OK) {
1854 PyTokenizer_Get(tok, &p_start, &p_end);
1855 }
1856 fclose(fp);
1857 if (tok->encoding) {
1858 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1859 if (encoding)
Miss Islington (bot)64db5aa2019-08-15 09:38:22 -07001860 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001861 }
1862 PyTokenizer_Free(tok);
1863 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001864}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001865
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001866char *
1867PyTokenizer_FindEncoding(int fd)
1868{
1869 return PyTokenizer_FindEncodingFilename(fd, NULL);
1870}
1871
Guido van Rossum408027e1996-12-30 16:17:54 +00001872#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001873
1874void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001875tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001876{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 printf("%s", _PyParser_TokenNames[type]);
1878 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1879 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001880}
1881
1882#endif