blob: f73c32684c7b73fb3169c68e3a4302dd8fe7a4b6 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00006#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00007#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00008
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009#include "tokenizer.h"
10#include "errcode.h"
11
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000012#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000013#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000014#include "fileobject.h"
15#include "codecs.h"
16#include "abstract.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000017
/* Alternate tab spacing (tab width of one column). */
#define ALTTABSIZE 1
20
/* True if C may be the first character of an identifier: an ASCII letter,
   an underscore, or any value >= 128.
   C is evaluated several times, so it must not have side effects.  Every
   use of C is parenthesized so that an arbitrary expression (for example
   `x & 0xff`) can be passed without being re-associated by operator
   precedence. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128.
   C is evaluated several times, so it must not have side effects.  Every
   use of C is parenthesized so that an arbitrary expression (for example
   `x & 0xff`) can be passed without being re-associated by operator
   precedence. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Defined elsewhere.  Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout,
                           const char *prompt);

38
/* Tab width used for indentation.
   Don't ever change this -- it would break the portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Forward declarations of static helpers used before their definition. */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Prefix that introduces a type comment.  Each space in this constant is
   treated as "zero or more spaces or tabs" when tokenizing. */
static const char *type_comment_prefix = "# type: ";
51
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000052/* Create and initialize a new tok_state structure */
53
54static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000055tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000056{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000057 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
58 sizeof(struct tok_state));
59 if (tok == NULL)
60 return NULL;
61 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
62 tok->done = E_OK;
63 tok->fp = NULL;
64 tok->input = NULL;
65 tok->tabsize = TABSIZE;
66 tok->indent = 0;
67 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040068
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000069 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000074 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
77 tok->read_coding_spec = 0;
78 tok->enc = NULL;
79 tok->encoding = NULL;
80 tok->cont_line = 0;
Victor Stinner7f2fee32011-04-05 00:39:01 +020081 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
Guido van Rossumdcfcd142019-01-31 03:40:27 -080084 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030085
Guido van Rossum495da292019-03-07 12:38:08 -080086 tok->async_hacks = 0;
87 tok->async_def = 0;
88 tok->async_def_indent = 0;
89 tok->async_def_nl = 0;
90
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000092}
93
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000094static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070095new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000096{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000097 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098 if (!result) {
99 tok->done = E_NOMEM;
100 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700102 memcpy(result, s, len);
103 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000105}
106
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000107static char *
108error_ret(struct tok_state *tok) /* XXX */
109{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000110 tok->decoding_erred = 1;
111 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
112 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
114 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000116}
117
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000118
/* Normalize the common spellings of the utf-8 and latin-1 encoding names.

   At most the first 12 characters of S are examined, after lowercasing ASCII
   letters and turning '_' into '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by a '-' suffix); otherwise returns S itself, so callers can
   detect "no change" by comparing pointers.

   Lowercasing is done by hand for ASCII only.  Calling tolower() on a plain
   char that may be negative is undefined behaviour, and its result depends
   on the current locale, whereas the names compared against below are pure
   ASCII. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0') {
            break;
        }
        else if (c == '_') {
            buf[i] = '-';
        }
        else if (c >= 'A' && c <= 'Z') {
            buf[i] = (char)(c - 'A' + 'a');
        }
        else {
            buf[i] = (char)c;
        }
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
147
148/* Return the coding spec in S, or NULL if none is found. */
149
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150static int
151get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700154 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 /* Coding spec must be in a comment, and that comment must be
156 * the only statement on the source code line. */
157 for (i = 0; i < size - 6; i++) {
158 if (s[i] == '#')
159 break;
160 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700161 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
163 for (; i < size - 6; i++) { /* XXX inefficient search */
164 const char* t = s + i;
165 if (strncmp(t, "coding", 6) == 0) {
166 const char* begin = NULL;
167 t += 6;
168 if (t[0] != ':' && t[0] != '=')
169 continue;
170 do {
171 t++;
172 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 begin = t;
175 while (Py_ISALNUM(t[0]) ||
176 t[0] == '-' || t[0] == '_' || t[0] == '.')
177 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700180 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200181 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700182 if (!r)
183 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700184 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 if (r != q) {
186 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700187 r = new_string(q, strlen(q), tok);
188 if (!r)
189 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700191 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200192 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 }
194 }
195 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700196 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000197}
198
199/* Check whether the line contains a coding spec. If it does,
200 invoke the set_readline function for the new encoding.
201 This function receives the tok_state and the new encoding.
202 Return 1 on success, 0 on failure. */
203
204static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000205check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000206 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000207{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700208 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000209 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000210
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200211 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000212 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200213 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200215 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700216 if (!get_coding_spec(line, &cs, size, tok))
217 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
226 tok->read_coding_spec = 1;
227 break;
228 }
229 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200231 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700232 tok->read_coding_spec = 1;
233 if (tok->encoding == NULL) {
234 assert(tok->decoding_state == STATE_RAW);
235 if (strcmp(cs, "utf-8") == 0) {
236 tok->encoding = cs;
237 } else {
238 r = set_readline(tok, cs);
239 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700241 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700243 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300244 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 "encoding problem: %s", cs);
246 PyMem_FREE(cs);
247 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700249 } else { /* then, compare cs with BOM */
250 r = (strcmp(tok->encoding, cs) == 0);
251 if (!r)
252 PyErr_Format(PyExc_SyntaxError,
253 "encoding problem: %s with BOM", cs);
254 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257}
258
259/* See whether the file starts with a BOM. If it does,
260 invoke the set_readline function with the new encoding.
261 Return 1 on success, 0 on failure. */
262
263static int
264check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 void unget_char(int, struct tok_state *),
266 int set_readline(struct tok_state *, const char *),
267 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 int ch1, ch2, ch3;
270 ch1 = get_char(tok);
271 tok->decoding_state = STATE_RAW;
272 if (ch1 == EOF) {
273 return 1;
274 } else if (ch1 == 0xEF) {
275 ch2 = get_char(tok);
276 if (ch2 != 0xBB) {
277 unget_char(ch2, tok);
278 unget_char(ch1, tok);
279 return 1;
280 }
281 ch3 = get_char(tok);
282 if (ch3 != 0xBF) {
283 unget_char(ch3, tok);
284 unget_char(ch2, tok);
285 unget_char(ch1, tok);
286 return 1;
287 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 /* Disable support for UTF-16 BOMs until a decision
290 is made whether this needs to be supported. */
291 } else if (ch1 == 0xFE) {
292 ch2 = get_char(tok);
293 if (ch2 != 0xFF) {
294 unget_char(ch2, tok);
295 unget_char(ch1, tok);
296 return 1;
297 }
298 if (!set_readline(tok, "utf-16-be"))
299 return 0;
300 tok->decoding_state = STATE_NORMAL;
301 } else if (ch1 == 0xFF) {
302 ch2 = get_char(tok);
303 if (ch2 != 0xFE) {
304 unget_char(ch2, tok);
305 unget_char(ch1, tok);
306 return 1;
307 }
308 if (!set_readline(tok, "utf-16-le"))
309 return 0;
310 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000311#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000312 } else {
313 unget_char(ch1, tok);
314 return 1;
315 }
316 if (tok->encoding != NULL)
317 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 tok->encoding = new_string("utf-8", 5, tok);
319 if (!tok->encoding)
320 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 /* No need to set_readline: input is already utf-8 */
322 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323}
324
325/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000326 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000327
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000328 On entry, tok->decoding_buffer will be one of:
329 1) NULL: need to call tok->decoding_readline to get a new line
330 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000332 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000333 (in the s buffer) to copy entire contents of the line read
334 by tok->decoding_readline. tok->decoding_buffer has the overflow.
335 In this case, fp_readl is called in a loop (with an expanded buffer)
336 until the buffer ends with a '\n' (or until the end of the file is
337 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000338*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339
340static char *
341fp_readl(char *s, int size, struct tok_state *tok)
342{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000343 PyObject* bufobj;
344 const char *buf;
345 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000346
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000347 /* Ask for one less byte so we can terminate it */
348 assert(size > 0);
349 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000350
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000351 if (tok->decoding_buffer) {
352 bufobj = tok->decoding_buffer;
353 Py_INCREF(bufobj);
354 }
355 else
356 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100357 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 if (bufobj == NULL)
359 goto error;
360 }
361 if (PyUnicode_CheckExact(bufobj))
362 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200363 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 if (buf == NULL) {
365 goto error;
366 }
367 }
368 else
369 {
370 buf = PyByteArray_AsString(bufobj);
371 if (buf == NULL) {
372 goto error;
373 }
374 buflen = PyByteArray_GET_SIZE(bufobj);
375 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000376
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 Py_XDECREF(tok->decoding_buffer);
378 if (buflen > size) {
379 /* Too many chars, the rest goes into tok->decoding_buffer */
380 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
381 buflen-size);
382 if (tok->decoding_buffer == NULL)
383 goto error;
384 buflen = size;
385 }
386 else
387 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000388
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000389 memcpy(s, buf, buflen);
390 s[buflen] = '\0';
391 if (buflen == 0) /* EOF */
392 s = NULL;
393 Py_DECREF(bufobj);
394 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000395
396error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 Py_XDECREF(bufobj);
398 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399}
400
401/* Set the readline function for TOK to a StreamReader's
402 readline function. The StreamReader is named ENC.
403
404 This function is called from check_bom and check_coding_spec.
405
406 ENC is usually identical to the future value of tok->encoding,
407 except for the (currently unsupported) case of UTF-16.
408
409 Return 1 on success, 0 on failure. */
410
411static int
412fp_setreadl(struct tok_state *tok, const char* enc)
413{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700414 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200415 _Py_IDENTIFIER(open);
416 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000417 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200418 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000419
Victor Stinner22a351a2010-10-14 12:04:34 +0000420 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200421 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100422 * position of tok->fp. If tok->fp was opened in text mode on Windows,
423 * its file position counts CRLF as one char and can't be directly mapped
424 * to the file offset for fd. Instead we step back one byte and read to
425 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200426 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100427 if (pos == -1 ||
428 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000429 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700430 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000431 }
432
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700433 io = PyImport_ImportModuleNoBlock("io");
434 if (io == NULL)
435 return 0;
436
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200437 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000438 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700439 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700441 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000442
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200443 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700444 Py_DECREF(stream);
445 if (readline == NULL)
446 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300447 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700448
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100449 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100450 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700451 if (bufobj == NULL)
452 return 0;
453 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100454 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000455
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000457}
458
459/* Fetch the next byte from TOK. */
460
461static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463}
464
465/* Unfetch the last byte back into TOK. */
466
467static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469}
470
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.

   S must point into a NUL-terminated buffer and *S must not be the
   terminator's successor: bytes are examined strictly left to right and the
   scan stops at the first byte that is not a continuation byte, so a
   sequence truncated by the terminating NUL never causes a read past it.

   Besides the structural check, the forms forbidden by RFC 3629 are
   rejected: overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
   UTF-16 surrogates (ED A0..BF) and code points above U+10FFFF
   (F4 90.. and lead bytes F5 and up). */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    unsigned char lead = s[0];

    if (lead < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (lead < 0xC2) {
        /* 80..BF: stray continuation byte; C0..C1: overlong 2-byte form */
        return 0;
    }
    if (lead < 0xE0) {
        /* C2..DF: U+0080..U+07FF */
        expected = 1;
    }
    else if (lead < 0xF0) {
        /* E0..EF: U+0800..U+FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* overlong 3-byte form */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* would decode to a surrogate, U+D800..U+DFFF */
            return 0;
        }
        expected = 2;
    }
    else if (lead < 0xF5) {
        /* F0..F4: U+10000..U+10FFFF */
        if (lead == 0xF0 && s[1] < 0x90) {
            /* overlong 4-byte form */
            return 0;
        }
        if (lead == 0xF4 && s[1] >= 0x90) {
            /* beyond U+10FFFF */
            return 0;
        }
        expected = 3;
    }
    else {
        /* F5..FF can never start a sequence */
        return 0;
    }

    /* Every following byte must be a continuation byte (80..BF).  Going
       forward guarantees that a NUL terminator is seen before any byte
       beyond it is touched. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
498
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499/* Read a line of input from TOK. Determine encoding
500 if necessary. */
501
502static char *
503decoding_fgets(char *s, int size, struct tok_state *tok)
504{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505 char *line = NULL;
506 int badchar = 0;
507 for (;;) {
508 if (tok->decoding_state == STATE_NORMAL) {
509 /* We already have a codec associated with
510 this input. */
511 line = fp_readl(s, size, tok);
512 break;
513 } else if (tok->decoding_state == STATE_RAW) {
514 /* We want a 'raw' read. */
515 line = Py_UniversalNewlineFgets(s, size,
516 tok->fp, NULL);
517 break;
518 } else {
519 /* We have not yet determined the encoding.
520 If an encoding is found, use the file-pointer
521 reader functions from now on. */
522 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
523 return error_ret(tok);
524 assert(tok->decoding_state != STATE_INIT);
525 }
526 }
527 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
528 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
529 return error_ret(tok);
530 }
531 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 /* The default encoding is UTF-8, so make sure we don't have any
533 non-UTF-8 sequences in it. */
534 if (line && !tok->encoding) {
535 unsigned char *c;
536 int length;
537 for (c = (unsigned char *)line; *c; c += length)
538 if (!(length = valid_utf8(c))) {
539 badchar = *c;
540 break;
541 }
542 }
543 if (badchar) {
544 /* Need to add 1 to the line number, since this line
545 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200546 PyErr_Format(PyExc_SyntaxError,
547 "Non-UTF-8 code starting with '\\x%.2x' "
548 "in file %U on line %i, "
549 "but no encoding declared; "
550 "see http://python.org/dev/peps/pep-0263/ for details",
551 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000552 return error_ret(tok);
553 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000554 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555}
556
557static int
558decoding_feof(struct tok_state *tok)
559{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000560 if (tok->decoding_state != STATE_NORMAL) {
561 return feof(tok->fp);
562 } else {
563 PyObject* buf = tok->decoding_buffer;
564 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100565 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 if (buf == NULL) {
567 error_ret(tok);
568 return 1;
569 } else {
570 tok->decoding_buffer = buf;
571 }
572 }
573 return PyObject_Length(buf) == 0;
574 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575}
576
577/* Fetch a byte from TOK, using the string buffer. */
578
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000579static int
580buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000581 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582}
583
584/* Unfetch a byte from TOK, using the string buffer. */
585
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000586static void
587buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 tok->str--;
589 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590}
591
592/* Set the readline function for TOK to ENC. For the string-based
593 tokenizer, this means to just record the encoding. */
594
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000595static int
596buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000597 tok->enc = enc;
598 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599}
600
601/* Return a UTF-8 encoding Python string object from the
602 C byte string STR, which is encoded with ENC. */
603
604static PyObject *
605translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000606 PyObject *utf8;
607 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
608 if (buf == NULL)
609 return NULL;
610 utf8 = PyUnicode_AsUTF8String(buf);
611 Py_DECREF(buf);
612 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613}
614
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000615
616static char *
617translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200618 int skip_next_lf = 0;
619 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000620 char *buf, *current;
621 char c = '\0';
622 buf = PyMem_MALLOC(needed_length);
623 if (buf == NULL) {
624 tok->done = E_NOMEM;
625 return NULL;
626 }
627 for (current = buf; *s; s++, current++) {
628 c = *s;
629 if (skip_next_lf) {
630 skip_next_lf = 0;
631 if (c == '\n') {
632 c = *++s;
633 if (!c)
634 break;
635 }
636 }
637 if (c == '\r') {
638 skip_next_lf = 1;
639 c = '\n';
640 }
641 *current = c;
642 }
643 /* If this is exec input, add a newline to the end of the string if
644 there isn't one already. */
645 if (exec_input && c != '\n') {
646 *current = '\n';
647 current++;
648 }
649 *current = '\0';
650 final_length = current - buf + 1;
Pablo Galindocb90c892019-03-19 17:17:58 +0000651 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000652 /* should never fail */
Pablo Galindocb90c892019-03-19 17:17:58 +0000653 char* result = PyMem_REALLOC(buf, final_length);
654 if (result == NULL) {
655 PyMem_FREE(buf);
656 }
657 buf = result;
658 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000660}
661
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662/* Decode a byte string STR for use as the buffer of TOK.
663 Look for encoding declarations inside STR, and record them
664 inside TOK. */
665
666static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000667decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 PyObject* utf8 = NULL;
670 const char *str;
671 const char *s;
672 const char *newl[2] = {NULL, NULL};
673 int lineno = 0;
674 tok->input = str = translate_newlines(input, single, tok);
675 if (str == NULL)
676 return NULL;
677 tok->enc = NULL;
678 tok->str = str;
679 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
680 return error_ret(tok);
681 str = tok->str; /* string after BOM if any */
682 assert(str);
683 if (tok->enc != NULL) {
684 utf8 = translate_into_utf8(str, tok->enc);
685 if (utf8 == NULL)
686 return error_ret(tok);
687 str = PyBytes_AsString(utf8);
688 }
689 for (s = str;; s++) {
690 if (*s == '\0') break;
691 else if (*s == '\n') {
692 assert(lineno < 2);
693 newl[lineno] = s;
694 lineno++;
695 if (lineno == 2) break;
696 }
697 }
698 tok->enc = NULL;
699 /* need to check line 1 and 2 separately since check_coding_spec
700 assumes a single line as input */
701 if (newl[0]) {
702 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
703 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200704 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
706 tok, buf_setreadl))
707 return error_ret(tok);
708 }
709 }
710 if (tok->enc != NULL) {
711 assert(utf8 == NULL);
712 utf8 = translate_into_utf8(str, tok->enc);
713 if (utf8 == NULL)
714 return error_ret(tok);
715 str = PyBytes_AS_STRING(utf8);
716 }
717 assert(tok->decoding_buffer == NULL);
718 tok->decoding_buffer = utf8; /* CAUTION */
719 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000720}
721
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722/* Set up tokenizer for string */
723
724struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000725PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 struct tok_state *tok = tok_new();
728 if (tok == NULL)
729 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300730 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 if (str == NULL) {
732 PyTokenizer_Free(tok);
733 return NULL;
734 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000735
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 /* XXX: constify members. */
737 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
738 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739}
740
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000741struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000742PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000743{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 struct tok_state *tok = tok_new();
745 if (tok == NULL)
746 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 tok->input = str = translate_newlines(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 if (str == NULL) {
749 PyTokenizer_Free(tok);
750 return NULL;
751 }
752 tok->decoding_state = STATE_RAW;
753 tok->read_coding_spec = 1;
754 tok->enc = NULL;
755 tok->str = str;
756 tok->encoding = (char *)PyMem_MALLOC(6);
757 if (!tok->encoding) {
758 PyTokenizer_Free(tok);
759 return NULL;
760 }
761 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000762
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 /* XXX: constify members. */
764 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
765 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000766}
767
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000768/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769
770struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300771PyTokenizer_FromFile(FILE *fp, const char* enc,
772 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000774 struct tok_state *tok = tok_new();
775 if (tok == NULL)
776 return NULL;
777 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
778 PyTokenizer_Free(tok);
779 return NULL;
780 }
781 tok->cur = tok->inp = tok->buf;
782 tok->end = tok->buf + BUFSIZ;
783 tok->fp = fp;
784 tok->prompt = ps1;
785 tok->nextprompt = ps2;
786 if (enc != NULL) {
787 /* Must copy encoding declaration since it
788 gets copied into the parse tree. */
789 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
790 if (!tok->encoding) {
791 PyTokenizer_Free(tok);
792 return NULL;
793 }
794 strcpy(tok->encoding, enc);
795 tok->decoding_state = STATE_NORMAL;
796 }
797 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798}
799
800
801/* Free a tok_state structure */
802
803void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000804PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 if (tok->encoding != NULL)
807 PyMem_FREE(tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 Py_XDECREF(tok->decoding_readline);
809 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200810 Py_XDECREF(tok->filename);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 if (tok->fp != NULL && tok->buf != NULL)
812 PyMem_FREE(tok->buf);
813 if (tok->input)
814 PyMem_FREE((char *)tok->input);
815 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816}
817
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818/* Get next char, updating state; error code goes into tok->done */
819
820static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200821tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 for (;;) {
824 if (tok->cur != tok->inp) {
825 return Py_CHARMASK(*tok->cur++); /* Fast path */
826 }
827 if (tok->done != E_OK)
828 return EOF;
829 if (tok->fp == NULL) {
830 char *end = strchr(tok->inp, '\n');
831 if (end != NULL)
832 end++;
833 else {
834 end = strchr(tok->inp, '\0');
835 if (end == tok->inp) {
836 tok->done = E_EOF;
837 return EOF;
838 }
839 }
840 if (tok->start == NULL)
841 tok->buf = tok->cur;
842 tok->line_start = tok->cur;
843 tok->lineno++;
844 tok->inp = end;
845 return Py_CHARMASK(*tok->cur++);
846 }
847 if (tok->prompt != NULL) {
848 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner89e34362011-01-07 18:47:22 +0000849 if (newtok != NULL) {
850 char *translated = translate_newlines(newtok, 0, tok);
851 PyMem_FREE(newtok);
852 if (translated == NULL)
853 return EOF;
854 newtok = translated;
855 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856 if (tok->encoding && newtok && *newtok) {
857 /* Recode to UTF-8 */
858 Py_ssize_t buflen;
859 const char* buf;
860 PyObject *u = translate_into_utf8(newtok, tok->encoding);
861 PyMem_FREE(newtok);
862 if (!u) {
863 tok->done = E_DECODE;
864 return EOF;
865 }
866 buflen = PyBytes_GET_SIZE(u);
867 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000868 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700869 if (newtok == NULL) {
870 Py_DECREF(u);
871 tok->done = E_NOMEM;
872 return EOF;
873 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000874 strcpy(newtok, buf);
875 Py_DECREF(u);
876 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000877 if (tok->nextprompt != NULL)
878 tok->prompt = tok->nextprompt;
879 if (newtok == NULL)
880 tok->done = E_INTR;
881 else if (*newtok == '\0') {
882 PyMem_FREE(newtok);
883 tok->done = E_EOF;
884 }
885 else if (tok->start != NULL) {
886 size_t start = tok->start - tok->buf;
887 size_t oldlen = tok->cur - tok->buf;
888 size_t newlen = oldlen + strlen(newtok);
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000889 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 char *buf = tok->buf;
891 buf = (char *)PyMem_REALLOC(buf, newlen+1);
892 tok->lineno++;
893 if (buf == NULL) {
894 PyMem_FREE(tok->buf);
895 tok->buf = NULL;
896 PyMem_FREE(newtok);
897 tok->done = E_NOMEM;
898 return EOF;
899 }
900 tok->buf = buf;
901 tok->cur = tok->buf + oldlen;
Pablo Galindo5ec91f72020-01-06 15:59:09 +0000902 tok->multi_line_start = tok->buf + cur_multi_line_start;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903 tok->line_start = tok->cur;
904 strcpy(tok->buf + oldlen, newtok);
905 PyMem_FREE(newtok);
906 tok->inp = tok->buf + newlen;
907 tok->end = tok->inp + 1;
908 tok->start = tok->buf + start;
909 }
910 else {
911 tok->lineno++;
912 if (tok->buf != NULL)
913 PyMem_FREE(tok->buf);
914 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 tok->cur = tok->buf;
916 tok->line_start = tok->buf;
917 tok->inp = strchr(tok->buf, '\0');
918 tok->end = tok->inp + 1;
919 }
920 }
921 else {
922 int done = 0;
923 Py_ssize_t cur = 0;
924 char *pt;
925 if (tok->start == NULL) {
926 if (tok->buf == NULL) {
927 tok->buf = (char *)
928 PyMem_MALLOC(BUFSIZ);
929 if (tok->buf == NULL) {
930 tok->done = E_NOMEM;
931 return EOF;
932 }
933 tok->end = tok->buf + BUFSIZ;
934 }
935 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
936 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200937 if (!tok->decoding_erred)
938 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 done = 1;
940 }
941 else {
942 tok->done = E_OK;
943 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700944 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000945 }
946 }
947 else {
948 cur = tok->cur - tok->buf;
949 if (decoding_feof(tok)) {
950 tok->done = E_EOF;
951 done = 1;
952 }
953 else
954 tok->done = E_OK;
955 }
956 tok->lineno++;
957 /* Read until '\n' or EOF */
958 while (!done) {
959 Py_ssize_t curstart = tok->start == NULL ? -1 :
960 tok->start - tok->buf;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700961 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000962 Py_ssize_t curvalid = tok->inp - tok->buf;
963 Py_ssize_t newsize = curvalid + BUFSIZ;
964 char *newbuf = tok->buf;
965 newbuf = (char *)PyMem_REALLOC(newbuf,
966 newsize);
967 if (newbuf == NULL) {
968 tok->done = E_NOMEM;
969 tok->cur = tok->inp;
970 return EOF;
971 }
972 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200973 tok->cur = tok->buf + cur;
Anthony Sottile5b94f352019-07-29 06:59:13 -0700974 tok->multi_line_start = tok->buf + cur_multi_line_start;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200975 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000976 tok->inp = tok->buf + curvalid;
977 tok->end = tok->buf + newsize;
978 tok->start = curstart < 0 ? NULL :
979 tok->buf + curstart;
980 if (decoding_fgets(tok->inp,
981 (int)(tok->end - tok->inp),
982 tok) == NULL) {
983 /* Break out early on decoding
984 errors, as tok->buf will be NULL
985 */
986 if (tok->decoding_erred)
987 return EOF;
988 /* Last line does not end in \n,
989 fake one */
Anthony Sottileabea73b2019-05-18 11:27:17 -0700990 if (tok->inp[-1] != '\n')
991 strcpy(tok->inp, "\n");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000992 }
993 tok->inp = strchr(tok->inp, '\0');
994 done = tok->inp[-1] == '\n';
995 }
996 if (tok->buf != NULL) {
997 tok->cur = tok->buf + cur;
998 tok->line_start = tok->cur;
999 /* replace "\r\n" with "\n" */
1000 /* For Mac leave the \r, giving a syntax error */
1001 pt = tok->inp - 2;
1002 if (pt >= tok->buf && *pt == '\r') {
1003 *pt++ = '\n';
1004 *pt = '\0';
1005 tok->inp = pt;
1006 }
1007 }
1008 }
1009 if (tok->done != E_OK) {
1010 if (tok->prompt != NULL)
1011 PySys_WriteStderr("\n");
1012 tok->cur = tok->inp;
1013 return EOF;
1014 }
1015 }
1016 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001017}
1018
1019
1020/* Back-up one character */
1021
1022static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001023tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001024{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001025 if (c != EOF) {
1026 if (--tok->cur < tok->buf)
1027 Py_FatalError("tok_backup: beginning of buffer");
1028 if (*tok->cur != c)
1029 *tok->cur = c;
1030 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001031}
1032
1033
Guido van Rossum926f13a1998-04-09 21:38:06 +00001034static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001035syntaxerror(struct tok_state *tok, const char *format, ...)
1036{
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001037 va_list vargs;
1038#ifdef HAVE_STDARG_PROTOTYPES
1039 va_start(vargs, format);
1040#else
1041 va_start(vargs);
1042#endif
1043 PyErr_FormatV(PyExc_SyntaxError, format, vargs);
1044 va_end(vargs);
1045 PyErr_SyntaxLocationObject(tok->filename,
1046 tok->lineno,
Victor Stinnerc8846162018-07-21 03:36:06 +02001047 (int)(tok->cur - tok->line_start));
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001048 tok->done = E_ERROR;
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001049 return ERRORTOKEN;
1050}
1051
1052static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001053indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001054{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001055 tok->done = E_TABSPACE;
1056 tok->cur = tok->inp;
1057 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001058}
1059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060/* Verify that the identifier follows PEP 3131.
1061 All identifier strings are guaranteed to be "ready" unicode objects.
1062 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001063static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001064verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001065{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001066 PyObject *s;
1067 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001068 if (tok->decoding_erred)
1069 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001071 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1073 PyErr_Clear();
1074 tok->done = E_IDENTIFIER;
1075 } else {
1076 tok->done = E_ERROR;
1077 }
1078 return 0;
1079 }
1080 result = PyUnicode_IsIdentifier(s);
1081 Py_DECREF(s);
1082 if (result == 0)
1083 tok->done = E_IDENTIFIER;
1084 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001085}
Guido van Rossum926f13a1998-04-09 21:38:06 +00001086
/* Consume the remainder of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character after the run; that character has been
   read but not pushed back.  If an underscore is not followed by a
   digit, the offending character is pushed back, a SyntaxError
   ("invalid decimal literal") is raised, and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Skip digits; ch ends up as the first non-digit. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* An underscore must be followed by another digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1108
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109/* Get next token, after space stripping etc. */
1110
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001111static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001112tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001113{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001114 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001116
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001118 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 tok->start = NULL;
1120 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 /* Get indentation level */
1123 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001124 int col = 0;
1125 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 tok->atbol = 0;
1127 for (;;) {
1128 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001129 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001131 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001132 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001133 col = (col / tok->tabsize + 1) * tok->tabsize;
1134 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 }
Brett Cannona721aba2016-09-09 14:57:09 -07001136 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001137 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001138 }
1139 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001141 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 }
1143 tok_backup(tok, c);
1144 if (c == '#' || c == '\n') {
1145 /* Lines with only whitespace and/or comments
1146 shouldn't affect the indentation and are
1147 not passed to the parser as NEWLINE tokens,
1148 except *totally* empty lines in interactive
1149 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001150 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001152 }
Batuhan Taşkaya109fc272019-12-09 07:36:27 +03001153 else if (tok->prompt != NULL && tok->lineno == 1) {
1154 /* In interactive mode, if the first line contains
1155 only spaces and/or a comment, let it through. */
1156 blankline = 0;
1157 col = altcol = 0;
1158 }
Brett Cannona721aba2016-09-09 14:57:09 -07001159 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001161 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 /* We can't jump back right here since we still
1163 may need to skip to the end of a comment */
1164 }
1165 if (!blankline && tok->level == 0) {
1166 if (col == tok->indstack[tok->indent]) {
1167 /* No change */
1168 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001169 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 }
1171 }
1172 else if (col > tok->indstack[tok->indent]) {
1173 /* Indent -- always one */
1174 if (tok->indent+1 >= MAXINDENT) {
1175 tok->done = E_TOODEEP;
1176 tok->cur = tok->inp;
1177 return ERRORTOKEN;
1178 }
1179 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001180 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 }
1182 tok->pendin++;
1183 tok->indstack[++tok->indent] = col;
1184 tok->altindstack[tok->indent] = altcol;
1185 }
1186 else /* col < tok->indstack[tok->indent] */ {
1187 /* Dedent -- any number, must be consistent */
1188 while (tok->indent > 0 &&
1189 col < tok->indstack[tok->indent]) {
1190 tok->pendin--;
1191 tok->indent--;
1192 }
1193 if (col != tok->indstack[tok->indent]) {
1194 tok->done = E_DEDENT;
1195 tok->cur = tok->inp;
1196 return ERRORTOKEN;
1197 }
1198 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001199 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 }
1201 }
1202 }
1203 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001204
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 /* Return pending indents/dedents */
1208 if (tok->pendin != 0) {
1209 if (tok->pendin < 0) {
1210 tok->pendin++;
1211 return DEDENT;
1212 }
1213 else {
1214 tok->pendin--;
1215 return INDENT;
1216 }
1217 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001218
Guido van Rossum495da292019-03-07 12:38:08 -08001219 /* Peek ahead at the next character */
1220 c = tok_nextc(tok);
1221 tok_backup(tok, c);
1222 /* Check if we are closing an async function */
1223 if (tok->async_def
1224 && !blankline
1225 /* Due to some implementation artifacts of type comments,
1226 * a TYPE_COMMENT at the start of a function won't set an
1227 * indentation level and it will produce a NEWLINE after it.
1228 * To avoid spuriously ending an async function due to this,
1229 * wait until we have some non-newline char in front of us. */
1230 && c != '\n'
1231 && tok->level == 0
1232 /* There was a NEWLINE after ASYNC DEF,
1233 so we're past the signature. */
1234 && tok->async_def_nl
1235 /* Current indentation level is less than where
1236 the async function was defined */
1237 && tok->async_def_indent >= tok->indent)
1238 {
1239 tok->async_def = 0;
1240 tok->async_def_indent = 0;
1241 tok->async_def_nl = 0;
1242 }
1243
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 tok->start = NULL;
1246 /* Skip spaces */
1247 do {
1248 c = tok_nextc(tok);
1249 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001250
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 /* Set start of current token */
1252 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001253
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001254 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001255 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001256 const char *prefix, *p, *type_start;
1257
Brett Cannona721aba2016-09-09 14:57:09 -07001258 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001260 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001261
1262 if (tok->type_comments) {
1263 p = tok->start;
1264 prefix = type_comment_prefix;
1265 while (*prefix && p < tok->cur) {
1266 if (*prefix == ' ') {
1267 while (*p == ' ' || *p == '\t') {
1268 p++;
1269 }
1270 } else if (*prefix == *p) {
1271 p++;
1272 } else {
1273 break;
1274 }
1275
1276 prefix++;
1277 }
1278
1279 /* This is a type comment if we matched all of type_comment_prefix. */
1280 if (!*prefix) {
1281 int is_type_ignore = 1;
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001282 const char *ignore_end = p + 6;
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001283 tok_backup(tok, c); /* don't eat the newline or EOF */
1284
1285 type_start = p;
1286
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001287 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001288 * or anything ASCII and non-alphanumeric. */
Michael J. Sullivand8320ec2019-05-11 11:17:24 -07001289 is_type_ignore = (
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001290 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
Michael J. Sullivand8a82e22019-05-22 13:43:37 -07001291 && !(tok->cur > ignore_end
1292 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001293
1294 if (is_type_ignore) {
Michael J. Sullivan933e1502019-05-22 07:54:20 -07001295 *p_start = (char *) ignore_end;
1296 *p_end = tok->cur;
1297
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001298 /* If this type ignore is the only thing on the line, consume the newline also. */
1299 if (blankline) {
1300 tok_nextc(tok);
1301 tok->atbol = 1;
1302 }
1303 return TYPE_IGNORE;
1304 } else {
1305 *p_start = (char *) type_start; /* after type_comment_prefix */
1306 *p_end = tok->cur;
1307 return TYPE_COMMENT;
1308 }
1309 }
1310 }
Brett Cannona721aba2016-09-09 14:57:09 -07001311 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001312
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 /* Check for EOF and errors now */
1314 if (c == EOF) {
1315 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1316 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001317
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 /* Identifier (most frequent token!) */
1319 nonascii = 0;
1320 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001321 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001322 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001323 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001324 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001325 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001326 /* Since this is a backwards compatibility support literal we don't
1327 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001328 else if (!(saw_b || saw_u || saw_r || saw_f)
1329 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001330 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001331 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001332 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001333 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001334 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001335 }
1336 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001337 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001338 }
1339 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001340 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001341 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001343 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001345 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 }
1347 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001348 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001350 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 c = tok_nextc(tok);
1352 }
1353 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001354 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001355 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001356 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 *p_start = tok->start;
1358 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001359
Guido van Rossum495da292019-03-07 12:38:08 -08001360 /* async/await parsing block. */
1361 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1362 /* May be an 'async' or 'await' token. For Python 3.7 or
1363 later we recognize them unconditionally. For Python
1364 3.5 or 3.6 we recognize 'async' in front of 'def', and
1365 either one inside of 'async def'. (Technically we
1366 shouldn't recognize these at all for 3.4 or earlier,
1367 but there's no *valid* Python 3.4 code that would be
1368 rejected, and async functions will be rejected in a
1369 later phase.) */
1370 if (!tok->async_hacks || tok->async_def) {
1371 /* Always recognize the keywords. */
1372 if (memcmp(tok->start, "async", 5) == 0) {
1373 return ASYNC;
1374 }
1375 if (memcmp(tok->start, "await", 5) == 0) {
1376 return AWAIT;
1377 }
1378 }
1379 else if (memcmp(tok->start, "async", 5) == 0) {
1380 /* The current token is 'async'.
1381 Look ahead one token to see if that is 'def'. */
1382
1383 struct tok_state ahead_tok;
1384 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1385 int ahead_tok_kind;
1386
1387 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1388 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1389 &ahead_tok_end);
1390
1391 if (ahead_tok_kind == NAME
1392 && ahead_tok.cur - ahead_tok.start == 3
1393 && memcmp(ahead_tok.start, "def", 3) == 0)
1394 {
1395 /* The next token is going to be 'def', so instead of
1396 returning a plain NAME token, return ASYNC. */
1397 tok->async_def_indent = tok->indent;
1398 tok->async_def = 1;
1399 return ASYNC;
1400 }
1401 }
1402 }
1403
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 return NAME;
1405 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 /* Newline */
1408 if (c == '\n') {
1409 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001410 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001412 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 *p_start = tok->start;
1414 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1415 tok->cont_line = 0;
Guido van Rossum495da292019-03-07 12:38:08 -08001416 if (tok->async_def) {
1417 /* We're somewhere inside an 'async def' function, and
1418 we've encountered a NEWLINE after its signature. */
1419 tok->async_def_nl = 1;
1420 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001421 return NEWLINE;
1422 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 /* Period or number starting with period? */
1425 if (c == '.') {
1426 c = tok_nextc(tok);
1427 if (isdigit(c)) {
1428 goto fraction;
1429 } else if (c == '.') {
1430 c = tok_nextc(tok);
1431 if (c == '.') {
1432 *p_start = tok->start;
1433 *p_end = tok->cur;
1434 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001435 }
1436 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 tok_backup(tok, c);
1438 }
1439 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001440 }
1441 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 tok_backup(tok, c);
1443 }
1444 *p_start = tok->start;
1445 *p_end = tok->cur;
1446 return DOT;
1447 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001448
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 /* Number */
1450 if (isdigit(c)) {
1451 if (c == '0') {
1452 /* Hex, octal or binary -- maybe. */
1453 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001455 /* Hex */
1456 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001457 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001458 if (c == '_') {
1459 c = tok_nextc(tok);
1460 }
1461 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001462 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001463 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001464 }
1465 do {
1466 c = tok_nextc(tok);
1467 } while (isxdigit(c));
1468 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 }
1470 else if (c == 'o' || c == 'O') {
1471 /* Octal */
1472 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001474 if (c == '_') {
1475 c = tok_nextc(tok);
1476 }
1477 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001478 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001479 if (isdigit(c)) {
1480 return syntaxerror(tok,
1481 "invalid digit '%c' in octal literal", c);
1482 }
1483 else {
1484 return syntaxerror(tok, "invalid octal literal");
1485 }
Brett Cannona721aba2016-09-09 14:57:09 -07001486 }
1487 do {
1488 c = tok_nextc(tok);
1489 } while ('0' <= c && c < '8');
1490 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001491 if (isdigit(c)) {
1492 return syntaxerror(tok,
1493 "invalid digit '%c' in octal literal", c);
1494 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 }
1496 else if (c == 'b' || c == 'B') {
1497 /* Binary */
1498 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001500 if (c == '_') {
1501 c = tok_nextc(tok);
1502 }
1503 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001504 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001505 if (isdigit(c)) {
1506 return syntaxerror(tok,
1507 "invalid digit '%c' in binary literal", c);
1508 }
1509 else {
1510 return syntaxerror(tok, "invalid binary literal");
1511 }
Brett Cannona721aba2016-09-09 14:57:09 -07001512 }
1513 do {
1514 c = tok_nextc(tok);
1515 } while (c == '0' || c == '1');
1516 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001517 if (isdigit(c)) {
1518 return syntaxerror(tok,
1519 "invalid digit '%c' in binary literal", c);
1520 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001521 }
1522 else {
1523 int nonzero = 0;
1524 /* maybe old-style octal; c is first char of it */
1525 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001526 while (1) {
1527 if (c == '_') {
1528 c = tok_nextc(tok);
1529 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001530 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001531 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001532 }
1533 }
1534 if (c != '0') {
1535 break;
1536 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 c = tok_nextc(tok);
1538 }
Brett Cannona721aba2016-09-09 14:57:09 -07001539 if (isdigit(c)) {
1540 nonzero = 1;
1541 c = tok_decimal_tail(tok);
1542 if (c == 0) {
1543 return ERRORTOKEN;
1544 }
1545 }
1546 if (c == '.') {
1547 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001548 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001549 }
1550 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001552 }
1553 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001555 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001556 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001557 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001559 return syntaxerror(tok,
1560 "leading zeros in decimal integer "
1561 "literals are not permitted; "
1562 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 }
1564 }
1565 }
1566 else {
1567 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001568 c = tok_decimal_tail(tok);
1569 if (c == 0) {
1570 return ERRORTOKEN;
1571 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001572 {
1573 /* Accept floating point numbers. */
1574 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001575 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001576 fraction:
1577 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001578 if (isdigit(c)) {
1579 c = tok_decimal_tail(tok);
1580 if (c == 0) {
1581 return ERRORTOKEN;
1582 }
1583 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 }
1585 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001586 int e;
1587 exponent:
1588 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 /* Exponent part */
1590 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001591 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001593 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001594 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001595 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001596 }
1597 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001599 tok_backup(tok, e);
1600 *p_start = tok->start;
1601 *p_end = tok->cur;
1602 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 }
Brett Cannona721aba2016-09-09 14:57:09 -07001604 c = tok_decimal_tail(tok);
1605 if (c == 0) {
1606 return ERRORTOKEN;
1607 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001608 }
Brett Cannona721aba2016-09-09 14:57:09 -07001609 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 /* Imaginary part */
1611 imaginary:
1612 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001613 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001614 }
1615 }
1616 tok_backup(tok, c);
1617 *p_start = tok->start;
1618 *p_end = tok->cur;
1619 return NUMBER;
1620 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001621
1622 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 /* String */
1624 if (c == '\'' || c == '"') {
1625 int quote = c;
1626 int quote_size = 1; /* 1 or 3 */
1627 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001628
Anthony Sottile995d9b92019-01-12 20:05:13 -08001629 /* Nodes of type STRING, especially multi line strings
1630 must be handled differently in order to get both
1631 the starting line number and the column offset right.
1632 (cf. issue 16806) */
1633 tok->first_lineno = tok->lineno;
1634 tok->multi_line_start = tok->line_start;
1635
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 /* Find the quote size and start of string */
1637 c = tok_nextc(tok);
1638 if (c == quote) {
1639 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001640 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001642 }
1643 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001645 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 }
Brett Cannona721aba2016-09-09 14:57:09 -07001647 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001649 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001650
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 /* Get rest of string */
1652 while (end_quote_size != quote_size) {
1653 c = tok_nextc(tok);
1654 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001655 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001656 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001657 }
1658 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001659 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001660 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001661 tok->cur = tok->inp;
1662 return ERRORTOKEN;
1663 }
1664 if (quote_size == 1 && c == '\n') {
1665 tok->done = E_EOLS;
1666 tok->cur = tok->inp;
1667 return ERRORTOKEN;
1668 }
Brett Cannona721aba2016-09-09 14:57:09 -07001669 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001671 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001672 else {
1673 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001674 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001675 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001676 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 }
1678 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001679
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680 *p_start = tok->start;
1681 *p_end = tok->cur;
1682 return STRING;
1683 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001684
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 /* Line continuation */
1686 if (c == '\\') {
1687 c = tok_nextc(tok);
1688 if (c != '\n') {
1689 tok->done = E_LINECONT;
1690 tok->cur = tok->inp;
1691 return ERRORTOKEN;
1692 }
Anthony Sottileabea73b2019-05-18 11:27:17 -07001693 c = tok_nextc(tok);
1694 if (c == EOF) {
1695 tok->done = E_EOF;
1696 tok->cur = tok->inp;
1697 return ERRORTOKEN;
1698 } else {
1699 tok_backup(tok, c);
1700 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 tok->cont_line = 1;
1702 goto again; /* Read next line */
1703 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001704
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 /* Check for two-character token */
1706 {
1707 int c2 = tok_nextc(tok);
1708 int token = PyToken_TwoChars(c, c2);
1709 if (token != OP) {
1710 int c3 = tok_nextc(tok);
1711 int token3 = PyToken_ThreeChars(c, c2, c3);
1712 if (token3 != OP) {
1713 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001714 }
1715 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001716 tok_backup(tok, c3);
1717 }
1718 *p_start = tok->start;
1719 *p_end = tok->cur;
1720 return token;
1721 }
1722 tok_backup(tok, c2);
1723 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001724
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001725 /* Keep track of parentheses nesting level */
1726 switch (c) {
1727 case '(':
1728 case '[':
1729 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001730 if (tok->level >= MAXLEVEL) {
1731 return syntaxerror(tok, "too many nested parentheses");
1732 }
1733 tok->parenstack[tok->level] = c;
1734 tok->parenlinenostack[tok->level] = tok->lineno;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 tok->level++;
1736 break;
1737 case ')':
1738 case ']':
1739 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001740 if (!tok->level) {
1741 return syntaxerror(tok, "unmatched '%c'", c);
1742 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001743 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001744 int opening = tok->parenstack[tok->level];
1745 if (!((opening == '(' && c == ')') ||
1746 (opening == '[' && c == ']') ||
1747 (opening == '{' && c == '}')))
1748 {
1749 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1750 return syntaxerror(tok,
1751 "closing parenthesis '%c' does not match "
1752 "opening parenthesis '%c' on line %d",
1753 c, opening, tok->parenlinenostack[tok->level]);
1754 }
1755 else {
1756 return syntaxerror(tok,
1757 "closing parenthesis '%c' does not match "
1758 "opening parenthesis '%c'",
1759 c, opening);
1760 }
1761 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001762 break;
1763 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001764
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 /* Punctuation character */
1766 *p_start = tok->start;
1767 *p_end = tok->cur;
1768 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001769}
1770
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001771int
1772PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1773{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 int result = tok_get(tok, p_start, p_end);
1775 if (tok->decoding_erred) {
1776 result = ERRORTOKEN;
1777 tok->done = E_DECODE;
1778 }
1779 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001780}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001781
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001782/* Get the encoding of a Python file. Check for the coding cookie and check if
1783 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001784
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001785 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1786 encoding in the first or second line of the file (in which case the encoding
1787 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001788
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001789 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1790 by the caller. */
1791
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001792char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001793PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001794{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001795 struct tok_state *tok;
1796 FILE *fp;
1797 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001798
Victor Stinnerdaf45552013-08-28 00:53:59 +02001799 fd = _Py_dup(fd);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001800 if (fd < 0) {
1801 return NULL;
1802 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001803
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001804 fp = fdopen(fd, "r");
1805 if (fp == NULL) {
1806 return NULL;
1807 }
1808 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1809 if (tok == NULL) {
1810 fclose(fp);
1811 return NULL;
1812 }
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001813 if (filename != NULL) {
1814 Py_INCREF(filename);
1815 tok->filename = filename;
1816 }
1817 else {
1818 tok->filename = PyUnicode_FromString("<string>");
1819 if (tok->filename == NULL) {
1820 fclose(fp);
1821 PyTokenizer_Free(tok);
1822 return encoding;
1823 }
1824 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001825 while (tok->lineno < 2 && tok->done == E_OK) {
1826 PyTokenizer_Get(tok, &p_start, &p_end);
1827 }
1828 fclose(fp);
1829 if (tok->encoding) {
1830 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1831 if (encoding)
Hansraj Das69f37bc2019-08-15 21:49:07 +05301832 strcpy(encoding, tok->encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001833 }
1834 PyTokenizer_Free(tok);
1835 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001836}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001837
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001838char *
1839PyTokenizer_FindEncoding(int fd)
1840{
1841 return PyTokenizer_FindEncodingFilename(fd, NULL);
1842}
1843
#ifdef Py_DEBUG

/* Debugging aid: print the name of token TYPE to stdout.  For NAME, NUMBER,
   STRING and OP tokens the token text between START and END is appended
   in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    if (type != NAME && type != NUMBER && type != STRING && type != OP) {
        return;
    }

    /* The text is not NUL-terminated at END, so print it with an explicit
       precision. */
    int length = (int)(end - start);
    printf("(%.*s)", length, start);
}

#endif