blob: 3e3cf2cd7f582a6e34674cb9ac9f465d1ba74113 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080021/* Alternate tab spacing */
22#define ALTTABSIZE 1
23
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  The argument is fully parenthesized so that
   expression arguments (e.g. a conditional or bitwise expression) are
   classified correctly.  C is evaluated more than once, so it must not
   have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000029
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  The argument is fully
   parenthesized so that expression arguments are classified correctly.
   C is evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000036
Serhiy Storchakac6792272013-10-19 21:03:34 +030037extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000038/* Return malloc'ed string including trailing \n;
39 empty malloc'ed string for EOF;
40 NULL if interrupted */
41
Guido van Rossum4fe87291992-02-26 15:24:44 +000042/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000043#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000044
Guido van Rossum3f5da241990-12-20 15:06:42 +000045/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000046static struct tok_state *tok_new(void);
47static int tok_nextc(struct tok_state *tok);
48static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000049
Brett Cannond5ec98c2007-10-20 02:54:14 +000050
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000051/* Create and initialize a new tok_state structure */
52
53static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000054tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000055{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000056 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
57 sizeof(struct tok_state));
58 if (tok == NULL)
59 return NULL;
60 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
61 tok->done = E_OK;
62 tok->fp = NULL;
63 tok->input = NULL;
64 tok->tabsize = TABSIZE;
65 tok->indent = 0;
66 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040067
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000068 tok->atbol = 1;
69 tok->pendin = 0;
70 tok->prompt = tok->nextprompt = NULL;
71 tok->lineno = 0;
72 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000073 tok->altindstack[0] = 0;
74 tok->decoding_state = STATE_INIT;
75 tok->decoding_erred = 0;
76 tok->read_coding_spec = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000080#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +020081 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000084#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +030085
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000086 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000087}
88
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000089static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070090new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000091{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000092 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070093 if (!result) {
94 tok->done = E_NOMEM;
95 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000096 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 memcpy(result, s, len);
98 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000100}
101
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000102#ifdef PGEN
103
104static char *
105decoding_fgets(char *s, int size, struct tok_state *tok)
106{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000108}
109
110static int
111decoding_feof(struct tok_state *tok)
112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000114}
115
/* PGEN build: no decoding is done; return an allocated copy of STR
   (NULL on allocation failure).  EXEC_INPUT is not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
121
122#else /* PGEN */
123
124static char *
125error_ret(struct tok_state *tok) /* XXX */
126{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000127 tok->decoding_erred = 1;
128 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
129 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200130 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
131 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133}
134
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135
/* Map the common spellings of UTF-8 and Latin-1 onto one canonical name.

   Only the first 12 bytes of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S names (or starts with "<name>-" for) one of those encodings,
   otherwise returns S itself, unchanged.  Callers can therefore detect
   "no normalization" by comparing the result pointer with S. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
164
165/* Return the coding spec in S, or NULL if none is found. */
166
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700167static int
168get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700171 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 /* Coding spec must be in a comment, and that comment must be
173 * the only statement on the source code line. */
174 for (i = 0; i < size - 6; i++) {
175 if (s[i] == '#')
176 break;
177 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700178 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 }
180 for (; i < size - 6; i++) { /* XXX inefficient search */
181 const char* t = s + i;
182 if (strncmp(t, "coding", 6) == 0) {
183 const char* begin = NULL;
184 t += 6;
185 if (t[0] != ':' && t[0] != '=')
186 continue;
187 do {
188 t++;
189 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 begin = t;
192 while (Py_ISALNUM(t[0]) ||
193 t[0] == '-' || t[0] == '_' || t[0] == '.')
194 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000195
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200198 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700199 if (!r)
200 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700201 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000202 if (r != q) {
203 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700204 r = new_string(q, strlen(q), tok);
205 if (!r)
206 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700208 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200209 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 }
211 }
212 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214}
215
216/* Check whether the line contains a coding spec. If it does,
217 invoke the set_readline function for the new encoding.
218 This function receives the tok_state and the new encoding.
219 Return 1 on success, 0 on failure. */
220
221static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000222check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700225 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000227
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200228 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200230 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200232 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (!get_coding_spec(line, &cs, size, tok))
234 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200235 if (!cs) {
236 Py_ssize_t i;
237 for (i = 0; i < size; i++) {
238 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
239 break;
240 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
241 /* Stop checking coding spec after a line containing
242 * anything except a comment. */
243 tok->read_coding_spec = 1;
244 break;
245 }
246 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200248 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700249 tok->read_coding_spec = 1;
250 if (tok->encoding == NULL) {
251 assert(tok->decoding_state == STATE_RAW);
252 if (strcmp(cs, "utf-8") == 0) {
253 tok->encoding = cs;
254 } else {
255 r = set_readline(tok, cs);
256 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700260 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300261 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700262 "encoding problem: %s", cs);
263 PyMem_FREE(cs);
264 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 } else { /* then, compare cs with BOM */
267 r = (strcmp(tok->encoding, cs) == 0);
268 if (!r)
269 PyErr_Format(PyExc_SyntaxError,
270 "encoding problem: %s with BOM", cs);
271 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000272 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274}
275
276/* See whether the file starts with a BOM. If it does,
277 invoke the set_readline function with the new encoding.
278 Return 1 on success, 0 on failure. */
279
280static int
281check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 void unget_char(int, struct tok_state *),
283 int set_readline(struct tok_state *, const char *),
284 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 int ch1, ch2, ch3;
287 ch1 = get_char(tok);
288 tok->decoding_state = STATE_RAW;
289 if (ch1 == EOF) {
290 return 1;
291 } else if (ch1 == 0xEF) {
292 ch2 = get_char(tok);
293 if (ch2 != 0xBB) {
294 unget_char(ch2, tok);
295 unget_char(ch1, tok);
296 return 1;
297 }
298 ch3 = get_char(tok);
299 if (ch3 != 0xBF) {
300 unget_char(ch3, tok);
301 unget_char(ch2, tok);
302 unget_char(ch1, tok);
303 return 1;
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 /* Disable support for UTF-16 BOMs until a decision
307 is made whether this needs to be supported. */
308 } else if (ch1 == 0xFE) {
309 ch2 = get_char(tok);
310 if (ch2 != 0xFF) {
311 unget_char(ch2, tok);
312 unget_char(ch1, tok);
313 return 1;
314 }
315 if (!set_readline(tok, "utf-16-be"))
316 return 0;
317 tok->decoding_state = STATE_NORMAL;
318 } else if (ch1 == 0xFF) {
319 ch2 = get_char(tok);
320 if (ch2 != 0xFE) {
321 unget_char(ch2, tok);
322 unget_char(ch1, tok);
323 return 1;
324 }
325 if (!set_readline(tok, "utf-16-le"))
326 return 0;
327 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 } else {
330 unget_char(ch1, tok);
331 return 1;
332 }
333 if (tok->encoding != NULL)
334 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700335 tok->encoding = new_string("utf-8", 5, tok);
336 if (!tok->encoding)
337 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 /* No need to set_readline: input is already utf-8 */
339 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340}
341
342/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000344
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000345 On entry, tok->decoding_buffer will be one of:
346 1) NULL: need to call tok->decoding_readline to get a new line
347 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000349 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 (in the s buffer) to copy entire contents of the line read
351 by tok->decoding_readline. tok->decoding_buffer has the overflow.
352 In this case, fp_readl is called in a loop (with an expanded buffer)
353 until the buffer ends with a '\n' (or until the end of the file is
354 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356
357static char *
358fp_readl(char *s, int size, struct tok_state *tok)
359{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 PyObject* bufobj;
361 const char *buf;
362 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Ask for one less byte so we can terminate it */
365 assert(size > 0);
366 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000368 if (tok->decoding_buffer) {
369 bufobj = tok->decoding_buffer;
370 Py_INCREF(bufobj);
371 }
372 else
373 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100374 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 if (bufobj == NULL)
376 goto error;
377 }
378 if (PyUnicode_CheckExact(bufobj))
379 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200380 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 if (buf == NULL) {
382 goto error;
383 }
384 }
385 else
386 {
387 buf = PyByteArray_AsString(bufobj);
388 if (buf == NULL) {
389 goto error;
390 }
391 buflen = PyByteArray_GET_SIZE(bufobj);
392 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 Py_XDECREF(tok->decoding_buffer);
395 if (buflen > size) {
396 /* Too many chars, the rest goes into tok->decoding_buffer */
397 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
398 buflen-size);
399 if (tok->decoding_buffer == NULL)
400 goto error;
401 buflen = size;
402 }
403 else
404 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000405
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 memcpy(s, buf, buflen);
407 s[buflen] = '\0';
408 if (buflen == 0) /* EOF */
409 s = NULL;
410 Py_DECREF(bufobj);
411 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000412
413error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000414 Py_XDECREF(bufobj);
415 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000416}
417
418/* Set the readline function for TOK to a StreamReader's
419 readline function. The StreamReader is named ENC.
420
421 This function is called from check_bom and check_coding_spec.
422
423 ENC is usually identical to the future value of tok->encoding,
424 except for the (currently unsupported) case of UTF-16.
425
426 Return 1 on success, 0 on failure. */
427
428static int
429fp_setreadl(struct tok_state *tok, const char* enc)
430{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700431 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200432 _Py_IDENTIFIER(open);
433 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000434 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200435 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
Victor Stinner22a351a2010-10-14 12:04:34 +0000437 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200438 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100439 * position of tok->fp. If tok->fp was opened in text mode on Windows,
440 * its file position counts CRLF as one char and can't be directly mapped
441 * to the file offset for fd. Instead we step back one byte and read to
442 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200443 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100444 if (pos == -1 ||
445 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000446 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700447 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000448 }
449
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700450 io = PyImport_ImportModuleNoBlock("io");
451 if (io == NULL)
452 return 0;
453
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200454 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000455 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000457 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700458 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200460 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 Py_DECREF(stream);
462 if (readline == NULL)
463 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300464 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700465
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100466 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100467 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700468 if (bufobj == NULL)
469 return 0;
470 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100471 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700473 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Fetch the next byte from TOK. */
477
478static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480}
481
482/* Unfetch the last byte back into TOK. */
483
484static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000485 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486}
487
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the number of continuation bytes
   (0x80..0xBF) that the lead byte announces.
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the lead/continuation structure is
   checked. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Walk the continuation bytes front to back: a NUL terminator is not
       a continuation byte, so a truncated sequence stops the scan before
       anything beyond the end of the string is read. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
515
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516/* Read a line of input from TOK. Determine encoding
517 if necessary. */
518
519static char *
520decoding_fgets(char *s, int size, struct tok_state *tok)
521{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 char *line = NULL;
523 int badchar = 0;
524 for (;;) {
525 if (tok->decoding_state == STATE_NORMAL) {
526 /* We already have a codec associated with
527 this input. */
528 line = fp_readl(s, size, tok);
529 break;
530 } else if (tok->decoding_state == STATE_RAW) {
531 /* We want a 'raw' read. */
532 line = Py_UniversalNewlineFgets(s, size,
533 tok->fp, NULL);
534 break;
535 } else {
536 /* We have not yet determined the encoding.
537 If an encoding is found, use the file-pointer
538 reader functions from now on. */
539 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
540 return error_ret(tok);
541 assert(tok->decoding_state != STATE_INIT);
542 }
543 }
544 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
545 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
546 return error_ret(tok);
547 }
548 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550 /* The default encoding is UTF-8, so make sure we don't have any
551 non-UTF-8 sequences in it. */
552 if (line && !tok->encoding) {
553 unsigned char *c;
554 int length;
555 for (c = (unsigned char *)line; *c; c += length)
556 if (!(length = valid_utf8(c))) {
557 badchar = *c;
558 break;
559 }
560 }
561 if (badchar) {
562 /* Need to add 1 to the line number, since this line
563 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200564 PyErr_Format(PyExc_SyntaxError,
565 "Non-UTF-8 code starting with '\\x%.2x' "
566 "in file %U on line %i, "
567 "but no encoding declared; "
568 "see http://python.org/dev/peps/pep-0263/ for details",
569 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000570 return error_ret(tok);
571 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000573 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574}
575
576static int
577decoding_feof(struct tok_state *tok)
578{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 if (tok->decoding_state != STATE_NORMAL) {
580 return feof(tok->fp);
581 } else {
582 PyObject* buf = tok->decoding_buffer;
583 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100584 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000585 if (buf == NULL) {
586 error_ret(tok);
587 return 1;
588 } else {
589 tok->decoding_buffer = buf;
590 }
591 }
592 return PyObject_Length(buf) == 0;
593 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594}
595
596/* Fetch a byte from TOK, using the string buffer. */
597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000598static int
599buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000600 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Unfetch a byte from TOK, using the string buffer. */
604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000605static void
606buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 tok->str--;
608 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609}
610
611/* Set the readline function for TOK to ENC. For the string-based
612 tokenizer, this means to just record the encoding. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 tok->enc = enc;
617 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
620/* Return a UTF-8 encoding Python string object from the
621 C byte string STR, which is encoded with ENC. */
622
623static PyObject *
624translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 PyObject *utf8;
626 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
627 if (buf == NULL)
628 return NULL;
629 utf8 = PyUnicode_AsUTF8String(buf);
630 Py_DECREF(buf);
631 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000634
635static char *
636translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200637 int skip_next_lf = 0;
638 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 char *buf, *current;
640 char c = '\0';
641 buf = PyMem_MALLOC(needed_length);
642 if (buf == NULL) {
643 tok->done = E_NOMEM;
644 return NULL;
645 }
646 for (current = buf; *s; s++, current++) {
647 c = *s;
648 if (skip_next_lf) {
649 skip_next_lf = 0;
650 if (c == '\n') {
651 c = *++s;
652 if (!c)
653 break;
654 }
655 }
656 if (c == '\r') {
657 skip_next_lf = 1;
658 c = '\n';
659 }
660 *current = c;
661 }
662 /* If this is exec input, add a newline to the end of the string if
663 there isn't one already. */
664 if (exec_input && c != '\n') {
665 *current = '\n';
666 current++;
667 }
668 *current = '\0';
669 final_length = current - buf + 1;
670 if (final_length < needed_length && final_length)
671 /* should never fail */
672 buf = PyMem_REALLOC(buf, final_length);
673 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000674}
675
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676/* Decode a byte string STR for use as the buffer of TOK.
677 Look for encoding declarations inside STR, and record them
678 inside TOK. */
679
680static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000681decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 PyObject* utf8 = NULL;
684 const char *str;
685 const char *s;
686 const char *newl[2] = {NULL, NULL};
687 int lineno = 0;
688 tok->input = str = translate_newlines(input, single, tok);
689 if (str == NULL)
690 return NULL;
691 tok->enc = NULL;
692 tok->str = str;
693 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
694 return error_ret(tok);
695 str = tok->str; /* string after BOM if any */
696 assert(str);
697 if (tok->enc != NULL) {
698 utf8 = translate_into_utf8(str, tok->enc);
699 if (utf8 == NULL)
700 return error_ret(tok);
701 str = PyBytes_AsString(utf8);
702 }
703 for (s = str;; s++) {
704 if (*s == '\0') break;
705 else if (*s == '\n') {
706 assert(lineno < 2);
707 newl[lineno] = s;
708 lineno++;
709 if (lineno == 2) break;
710 }
711 }
712 tok->enc = NULL;
713 /* need to check line 1 and 2 separately since check_coding_spec
714 assumes a single line as input */
715 if (newl[0]) {
716 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
717 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200718 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
720 tok, buf_setreadl))
721 return error_ret(tok);
722 }
723 }
724 if (tok->enc != NULL) {
725 assert(utf8 == NULL);
726 utf8 = translate_into_utf8(str, tok->enc);
727 if (utf8 == NULL)
728 return error_ret(tok);
729 str = PyBytes_AS_STRING(utf8);
730 }
731 assert(tok->decoding_buffer == NULL);
732 tok->decoding_buffer = utf8; /* CAUTION */
733 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734}
735
736#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737
738/* Set up tokenizer for string */
739
740struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000741PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 struct tok_state *tok = tok_new();
744 if (tok == NULL)
745 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300746 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 if (str == NULL) {
748 PyTokenizer_Free(tok);
749 return NULL;
750 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000751
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 /* XXX: constify members. */
753 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
754 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755}
756
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000757struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000758PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000759{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 struct tok_state *tok = tok_new();
761 if (tok == NULL)
762 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000765#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 if (str == NULL) {
767 PyTokenizer_Free(tok);
768 return NULL;
769 }
770 tok->decoding_state = STATE_RAW;
771 tok->read_coding_spec = 1;
772 tok->enc = NULL;
773 tok->str = str;
774 tok->encoding = (char *)PyMem_MALLOC(6);
775 if (!tok->encoding) {
776 PyTokenizer_Free(tok);
777 return NULL;
778 }
779 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000780
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 /* XXX: constify members. */
782 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
783 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000784}
785
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000786/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787
788struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300789PyTokenizer_FromFile(FILE *fp, const char* enc,
790 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 struct tok_state *tok = tok_new();
793 if (tok == NULL)
794 return NULL;
795 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
796 PyTokenizer_Free(tok);
797 return NULL;
798 }
799 tok->cur = tok->inp = tok->buf;
800 tok->end = tok->buf + BUFSIZ;
801 tok->fp = fp;
802 tok->prompt = ps1;
803 tok->nextprompt = ps2;
804 if (enc != NULL) {
805 /* Must copy encoding declaration since it
806 gets copied into the parse tree. */
807 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
808 if (!tok->encoding) {
809 PyTokenizer_Free(tok);
810 return NULL;
811 }
812 strcpy(tok->encoding, enc);
813 tok->decoding_state = STATE_NORMAL;
814 }
815 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816}
817
818
819/* Free a tok_state structure */
820
821void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000822PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 if (tok->encoding != NULL)
825 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000826#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 Py_XDECREF(tok->decoding_readline);
828 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200829 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000830#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 if (tok->fp != NULL && tok->buf != NULL)
832 PyMem_FREE(tok->buf);
833 if (tok->input)
834 PyMem_FREE((char *)tok->input);
835 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836}
837
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838/* Get next char, updating state; error code goes into tok->done */
839
840static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200841tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 for (;;) {
844 if (tok->cur != tok->inp) {
845 return Py_CHARMASK(*tok->cur++); /* Fast path */
846 }
847 if (tok->done != E_OK)
848 return EOF;
849 if (tok->fp == NULL) {
850 char *end = strchr(tok->inp, '\n');
851 if (end != NULL)
852 end++;
853 else {
854 end = strchr(tok->inp, '\0');
855 if (end == tok->inp) {
856 tok->done = E_EOF;
857 return EOF;
858 }
859 }
860 if (tok->start == NULL)
861 tok->buf = tok->cur;
862 tok->line_start = tok->cur;
863 tok->lineno++;
864 tok->inp = end;
865 return Py_CHARMASK(*tok->cur++);
866 }
867 if (tok->prompt != NULL) {
868 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000869#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000870 if (newtok != NULL) {
871 char *translated = translate_newlines(newtok, 0, tok);
872 PyMem_FREE(newtok);
873 if (translated == NULL)
874 return EOF;
875 newtok = translated;
876 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000877 if (tok->encoding && newtok && *newtok) {
878 /* Recode to UTF-8 */
879 Py_ssize_t buflen;
880 const char* buf;
881 PyObject *u = translate_into_utf8(newtok, tok->encoding);
882 PyMem_FREE(newtok);
883 if (!u) {
884 tok->done = E_DECODE;
885 return EOF;
886 }
887 buflen = PyBytes_GET_SIZE(u);
888 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700890 if (newtok == NULL) {
891 Py_DECREF(u);
892 tok->done = E_NOMEM;
893 return EOF;
894 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000895 strcpy(newtok, buf);
896 Py_DECREF(u);
897 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000898#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 if (tok->nextprompt != NULL)
900 tok->prompt = tok->nextprompt;
901 if (newtok == NULL)
902 tok->done = E_INTR;
903 else if (*newtok == '\0') {
904 PyMem_FREE(newtok);
905 tok->done = E_EOF;
906 }
907 else if (tok->start != NULL) {
908 size_t start = tok->start - tok->buf;
909 size_t oldlen = tok->cur - tok->buf;
910 size_t newlen = oldlen + strlen(newtok);
911 char *buf = tok->buf;
912 buf = (char *)PyMem_REALLOC(buf, newlen+1);
913 tok->lineno++;
914 if (buf == NULL) {
915 PyMem_FREE(tok->buf);
916 tok->buf = NULL;
917 PyMem_FREE(newtok);
918 tok->done = E_NOMEM;
919 return EOF;
920 }
921 tok->buf = buf;
922 tok->cur = tok->buf + oldlen;
923 tok->line_start = tok->cur;
924 strcpy(tok->buf + oldlen, newtok);
925 PyMem_FREE(newtok);
926 tok->inp = tok->buf + newlen;
927 tok->end = tok->inp + 1;
928 tok->start = tok->buf + start;
929 }
930 else {
931 tok->lineno++;
932 if (tok->buf != NULL)
933 PyMem_FREE(tok->buf);
934 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 tok->cur = tok->buf;
936 tok->line_start = tok->buf;
937 tok->inp = strchr(tok->buf, '\0');
938 tok->end = tok->inp + 1;
939 }
940 }
941 else {
942 int done = 0;
943 Py_ssize_t cur = 0;
944 char *pt;
945 if (tok->start == NULL) {
946 if (tok->buf == NULL) {
947 tok->buf = (char *)
948 PyMem_MALLOC(BUFSIZ);
949 if (tok->buf == NULL) {
950 tok->done = E_NOMEM;
951 return EOF;
952 }
953 tok->end = tok->buf + BUFSIZ;
954 }
955 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
956 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200957 if (!tok->decoding_erred)
958 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000959 done = 1;
960 }
961 else {
962 tok->done = E_OK;
963 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700964 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000965 }
966 }
967 else {
968 cur = tok->cur - tok->buf;
969 if (decoding_feof(tok)) {
970 tok->done = E_EOF;
971 done = 1;
972 }
973 else
974 tok->done = E_OK;
975 }
976 tok->lineno++;
977 /* Read until '\n' or EOF */
978 while (!done) {
979 Py_ssize_t curstart = tok->start == NULL ? -1 :
980 tok->start - tok->buf;
981 Py_ssize_t curvalid = tok->inp - tok->buf;
982 Py_ssize_t newsize = curvalid + BUFSIZ;
983 char *newbuf = tok->buf;
984 newbuf = (char *)PyMem_REALLOC(newbuf,
985 newsize);
986 if (newbuf == NULL) {
987 tok->done = E_NOMEM;
988 tok->cur = tok->inp;
989 return EOF;
990 }
991 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200992 tok->cur = tok->buf + cur;
993 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000994 tok->inp = tok->buf + curvalid;
995 tok->end = tok->buf + newsize;
996 tok->start = curstart < 0 ? NULL :
997 tok->buf + curstart;
998 if (decoding_fgets(tok->inp,
999 (int)(tok->end - tok->inp),
1000 tok) == NULL) {
1001 /* Break out early on decoding
1002 errors, as tok->buf will be NULL
1003 */
1004 if (tok->decoding_erred)
1005 return EOF;
1006 /* Last line does not end in \n,
1007 fake one */
1008 strcpy(tok->inp, "\n");
1009 }
1010 tok->inp = strchr(tok->inp, '\0');
1011 done = tok->inp[-1] == '\n';
1012 }
1013 if (tok->buf != NULL) {
1014 tok->cur = tok->buf + cur;
1015 tok->line_start = tok->cur;
1016 /* replace "\r\n" with "\n" */
1017 /* For Mac leave the \r, giving a syntax error */
1018 pt = tok->inp - 2;
1019 if (pt >= tok->buf && *pt == '\r') {
1020 *pt++ = '\n';
1021 *pt = '\0';
1022 tok->inp = pt;
1023 }
1024 }
1025 }
1026 if (tok->done != E_OK) {
1027 if (tok->prompt != NULL)
1028 PySys_WriteStderr("\n");
1029 tok->cur = tok->inp;
1030 return EOF;
1031 }
1032 }
1033 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001034}
1035
1036
1037/* Back-up one character */
1038
1039static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001040tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 if (c != EOF) {
1043 if (--tok->cur < tok->buf)
1044 Py_FatalError("tok_backup: beginning of buffer");
1045 if (*tok->cur != c)
1046 *tok->cur = c;
1047 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048}
1049
1050
Guido van Rossum926f13a1998-04-09 21:38:06 +00001051static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001052syntaxerror(struct tok_state *tok, const char *format, ...)
1053{
1054#ifndef PGEN
1055 va_list vargs;
1056#ifdef HAVE_STDARG_PROTOTYPES
1057 va_start(vargs, format);
1058#else
1059 va_start(vargs);
1060#endif
1061 PyErr_FormatV(PyExc_SyntaxError, format, vargs);
1062 va_end(vargs);
1063 PyErr_SyntaxLocationObject(tok->filename,
1064 tok->lineno,
Victor Stinnerc8846162018-07-21 03:36:06 +02001065 (int)(tok->cur - tok->line_start));
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001066 tok->done = E_ERROR;
1067#else
1068 tok->done = E_TOKEN;
1069#endif
1070 return ERRORTOKEN;
1071}
1072
1073static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001074indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001075{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001076 tok->done = E_TABSPACE;
1077 tok->cur = tok->inp;
1078 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001079}
1080
Martin v. Löwis47383402007-08-15 07:32:56 +00001081#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001082#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001083#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084/* Verify that the identifier follows PEP 3131.
1085 All identifier strings are guaranteed to be "ready" unicode objects.
1086 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001087static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001088verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001089{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 PyObject *s;
1091 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001092 if (tok->decoding_erred)
1093 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001095 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001096 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1097 PyErr_Clear();
1098 tok->done = E_IDENTIFIER;
1099 } else {
1100 tok->done = E_ERROR;
1101 }
1102 return 0;
1103 }
1104 result = PyUnicode_IsIdentifier(s);
1105 Py_DECREF(s);
1106 if (result == 0)
1107 tok->done = E_IDENTIFIER;
1108 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001109}
1110#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001111
/* Consume the rest of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character after the run (not pushed back), or 0
   after reporting "invalid decimal literal" when an underscore is not
   followed by a digit. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Skip one group of digits. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* An underscore must introduce another digit group. */
        ch = tok_nextc(tok);
        if (isdigit(ch)) {
            continue;
        }
        tok_backup(tok, ch);
        syntaxerror(tok, "invalid decimal literal");
        return 0;
    }
}
1133
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001134/* Get next token, after space stripping etc. */
1135
/* Scan the next token from tok and return its token type.

   *p_start and *p_end are first set to NULL.  For tokens that carry
   text (NAME, NUMBER, STRING, NEWLINE, operators and punctuation) they
   are set to the first byte of the token and one past its last byte
   inside the tokenizer buffer.  They stay NULL for INDENT, DEDENT,
   ENDMARKER and ERRORTOKEN.

   At the beginning of a line (tok->atbol) the leading whitespace is
   measured and compared with the indentation stacks; the resulting
   INDENT/DEDENT tokens are queued in tok->pendin and handed out one per
   call.  Errors return ERRORTOKEN with the reason stored in tok->done,
   either directly or through syntaxerror()/indenterror(). */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001136static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001137tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001138{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001139 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001141
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 tok->start = NULL;
1145 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 /* Get indentation level */
1148 if (tok->atbol) {
/* col expands tabs to multiples of tok->tabsize, altcol to multiples of
   ALTTABSIZE.  Both are compared with their own stacks below, and a
   disagreement between the two is reported through indenterror(). */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001149 int col = 0;
1150 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 tok->atbol = 0;
1152 for (;;) {
1153 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001154 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001156 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001158 col = (col / tok->tabsize + 1) * tok->tabsize;
1159 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 }
Brett Cannona721aba2016-09-09 14:57:09 -07001161 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001163 }
1164 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001166 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167 }
1168 tok_backup(tok, c);
1169 if (c == '#' || c == '\n') {
1170 /* Lines with only whitespace and/or comments
1171 shouldn't affect the indentation and are
1172 not passed to the parser as NEWLINE tokens,
1173 except *totally* empty lines in interactive
1174 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001175 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001176 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001177 }
1178 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001180 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 /* We can't jump back right here since we still
1182 may need to skip to the end of a comment */
1183 }
/* Indentation is only compared outside brackets (tok->level == 0). */
1184 if (!blankline && tok->level == 0) {
1185 if (col == tok->indstack[tok->indent]) {
1186 /* No change */
1187 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001188 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 }
1190 }
1191 else if (col > tok->indstack[tok->indent]) {
1192 /* Indent -- always one */
1193 if (tok->indent+1 >= MAXINDENT) {
1194 tok->done = E_TOODEEP;
1195 tok->cur = tok->inp;
1196 return ERRORTOKEN;
1197 }
1198 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001199 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 }
1201 tok->pendin++;
1202 tok->indstack[++tok->indent] = col;
1203 tok->altindstack[tok->indent] = altcol;
1204 }
1205 else /* col < tok->indstack[tok->indent] */ {
1206 /* Dedent -- any number, must be consistent */
1207 while (tok->indent > 0 &&
1208 col < tok->indstack[tok->indent]) {
1209 tok->pendin--;
1210 tok->indent--;
1211 }
1212 if (col != tok->indstack[tok->indent]) {
1213 tok->done = E_DEDENT;
1214 tok->cur = tok->inp;
1215 return ERRORTOKEN;
1216 }
1217 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001218 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 }
1220 }
1221 }
1222 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001223
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001225
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 /* Return pending indents/dedents */
1227 if (tok->pendin != 0) {
1228 if (tok->pendin < 0) {
1229 tok->pendin++;
1230 return DEDENT;
1231 }
1232 else {
1233 tok->pendin--;
1234 return INDENT;
1235 }
1236 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001237
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001238 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 tok->start = NULL;
1240 /* Skip spaces */
1241 do {
1242 c = tok_nextc(tok);
1243 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 /* Set start of current token */
1246 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001247
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 /* Skip comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001249 if (c == '#') {
1250 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001252 }
1253 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001255 /* Check for EOF and errors now */
1256 if (c == EOF) {
1257 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1258 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 /* Identifier (most frequent token!) */
1261 nonascii = 0;
1262 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001263 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001264 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001265 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001266 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001267 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001268 /* Since this is a backwards compatibility support literal we don't
1269 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001270 else if (!(saw_b || saw_u || saw_r || saw_f)
1271 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001272 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001273 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001274 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001275 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001276 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001277 }
1278 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001279 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001280 }
1281 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001282 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001283 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001285 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001287 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 }
/* No quote followed the prefix letters: consume the remaining
   identifier characters, noting any byte >= 128 so the name is
   checked by verify_identifier(). */
1289 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001290 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001292 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 c = tok_nextc(tok);
1294 }
1295 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001296 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001298 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 *p_start = tok->start;
1300 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001301
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 return NAME;
1303 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001304
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 /* Newline */
1306 if (c == '\n') {
1307 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001308 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001310 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 *p_start = tok->start;
1312 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1313 tok->cont_line = 0;
1314 return NEWLINE;
1315 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001316
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 /* Period or number starting with period? */
1318 if (c == '.') {
1319 c = tok_nextc(tok);
1320 if (isdigit(c)) {
1321 goto fraction;
1322 } else if (c == '.') {
1323 c = tok_nextc(tok);
1324 if (c == '.') {
1325 *p_start = tok->start;
1326 *p_end = tok->cur;
1327 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001328 }
1329 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 tok_backup(tok, c);
1331 }
/* Only two dots: also push back the second one so that a single DOT
   is returned below. */
1332 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001333 }
1334 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 tok_backup(tok, c);
1336 }
1337 *p_start = tok->start;
1338 *p_end = tok->cur;
1339 return DOT;
1340 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001341
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 /* Number */
1343 if (isdigit(c)) {
1344 if (c == '0') {
1345 /* Hex, octal or binary -- maybe. */
1346 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001347 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 /* Hex */
1349 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001351 if (c == '_') {
1352 c = tok_nextc(tok);
1353 }
1354 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001355 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001356 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001357 }
1358 do {
1359 c = tok_nextc(tok);
1360 } while (isxdigit(c));
1361 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 }
1363 else if (c == 'o' || c == 'O') {
1364 /* Octal */
1365 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001367 if (c == '_') {
1368 c = tok_nextc(tok);
1369 }
1370 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001371 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001372 if (isdigit(c)) {
1373 return syntaxerror(tok,
1374 "invalid digit '%c' in octal literal", c);
1375 }
1376 else {
1377 return syntaxerror(tok, "invalid octal literal");
1378 }
Brett Cannona721aba2016-09-09 14:57:09 -07001379 }
1380 do {
1381 c = tok_nextc(tok);
1382 } while ('0' <= c && c < '8');
1383 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001384 if (isdigit(c)) {
1385 return syntaxerror(tok,
1386 "invalid digit '%c' in octal literal", c);
1387 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 }
1389 else if (c == 'b' || c == 'B') {
1390 /* Binary */
1391 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001393 if (c == '_') {
1394 c = tok_nextc(tok);
1395 }
1396 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001397 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001398 if (isdigit(c)) {
1399 return syntaxerror(tok,
1400 "invalid digit '%c' in binary literal", c);
1401 }
1402 else {
1403 return syntaxerror(tok, "invalid binary literal");
1404 }
Brett Cannona721aba2016-09-09 14:57:09 -07001405 }
1406 do {
1407 c = tok_nextc(tok);
1408 } while (c == '0' || c == '1');
1409 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001410 if (isdigit(c)) {
1411 return syntaxerror(tok,
1412 "invalid digit '%c' in binary literal", c);
1413 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 }
1415 else {
1416 int nonzero = 0;
1417 /* maybe old-style octal; c is first char of it */
1418 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001419 while (1) {
1420 if (c == '_') {
1421 c = tok_nextc(tok);
1422 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001423 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001424 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001425 }
1426 }
1427 if (c != '0') {
1428 break;
1429 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 c = tok_nextc(tok);
1431 }
Brett Cannona721aba2016-09-09 14:57:09 -07001432 if (isdigit(c)) {
1433 nonzero = 1;
1434 c = tok_decimal_tail(tok);
1435 if (c == 0) {
1436 return ERRORTOKEN;
1437 }
1438 }
1439 if (c == '.') {
1440 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001442 }
1443 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001445 }
1446 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001448 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001450 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001452 return syntaxerror(tok,
1453 "leading zeros in decimal integer "
1454 "literals are not permitted; "
1455 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 }
1457 }
1458 }
1459 else {
1460 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001461 c = tok_decimal_tail(tok);
1462 if (c == 0) {
1463 return ERRORTOKEN;
1464 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 {
1466 /* Accept floating point numbers. */
1467 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001468 c = tok_nextc(tok);
/* The fraction, exponent and imaginary labels below are also entered by
   goto from the leading-'.' and leading-'0' paths above. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 fraction:
1470 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001471 if (isdigit(c)) {
1472 c = tok_decimal_tail(tok);
1473 if (c == 0) {
1474 return ERRORTOKEN;
1475 }
1476 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 }
1478 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001479 int e;
1480 exponent:
1481 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 /* Exponent part */
1483 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001484 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001486 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001487 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001488 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001489 }
1490 } else if (!isdigit(c)) {
/* No digit after the 'e': it is not an exponent.  Push back the
   lookahead and the 'e' itself and return the number read so far. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001492 tok_backup(tok, e);
1493 *p_start = tok->start;
1494 *p_end = tok->cur;
1495 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 }
Brett Cannona721aba2016-09-09 14:57:09 -07001497 c = tok_decimal_tail(tok);
1498 if (c == 0) {
1499 return ERRORTOKEN;
1500 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 }
Brett Cannona721aba2016-09-09 14:57:09 -07001502 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001503 /* Imaginary part */
1504 imaginary:
1505 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001506 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 }
1508 }
1509 tok_backup(tok, c);
1510 *p_start = tok->start;
1511 *p_end = tok->cur;
1512 return NUMBER;
1513 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001514
1515 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001516 /* String */
1517 if (c == '\'' || c == '"') {
1518 int quote = c;
1519 int quote_size = 1; /* 1 or 3 */
1520 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001521
Anthony Sottile995d9b92019-01-12 20:05:13 -08001522 /* Nodes of type STRING, especially multi line strings
1523 must be handled differently in order to get both
1524 the starting line number and the column offset right.
1525 (cf. issue 16806) */
1526 tok->first_lineno = tok->lineno;
1527 tok->multi_line_start = tok->line_start;
1528
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001529 /* Find the quote size and start of string */
1530 c = tok_nextc(tok);
1531 if (c == quote) {
1532 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001533 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001535 }
1536 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001538 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 }
Brett Cannona721aba2016-09-09 14:57:09 -07001540 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001541 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001542 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001543
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001544 /* Get rest of string */
/* end_quote_size counts consecutive closing quote characters; the
   string ends once it reaches quote_size. */
1545 while (end_quote_size != quote_size) {
1546 c = tok_nextc(tok);
1547 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001548 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001550 }
1551 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001552 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001553 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554 tok->cur = tok->inp;
1555 return ERRORTOKEN;
1556 }
1557 if (quote_size == 1 && c == '\n') {
1558 tok->done = E_EOLS;
1559 tok->cur = tok->inp;
1560 return ERRORTOKEN;
1561 }
Brett Cannona721aba2016-09-09 14:57:09 -07001562 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001564 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001565 else {
1566 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001567 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001568 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001569 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001570 }
1571 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001572
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001573 *p_start = tok->start;
1574 *p_end = tok->cur;
1575 return STRING;
1576 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001577
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001578 /* Line continuation */
1579 if (c == '\\') {
1580 c = tok_nextc(tok);
1581 if (c != '\n') {
1582 tok->done = E_LINECONT;
1583 tok->cur = tok->inp;
1584 return ERRORTOKEN;
1585 }
1586 tok->cont_line = 1;
1587 goto again; /* Read next line */
1588 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001589
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 /* Check for two-character token */
1591 {
1592 int c2 = tok_nextc(tok);
1593 int token = PyToken_TwoChars(c, c2);
/* Any result other than OP is taken as a recognized two-character
   token; it is then extended to three characters when
   PyToken_ThreeChars() also yields something other than OP. */
1594 if (token != OP) {
1595 int c3 = tok_nextc(tok);
1596 int token3 = PyToken_ThreeChars(c, c2, c3);
1597 if (token3 != OP) {
1598 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001599 }
1600 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 tok_backup(tok, c3);
1602 }
1603 *p_start = tok->start;
1604 *p_end = tok->cur;
1605 return token;
1606 }
1607 tok_backup(tok, c2);
1608 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 /* Keep track of parentheses nesting level */
1611 switch (c) {
1612 case '(':
1613 case '[':
1614 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001615#ifndef PGEN
1616 if (tok->level >= MAXLEVEL) {
1617 return syntaxerror(tok, "too many nested parentheses");
1618 }
1619 tok->parenstack[tok->level] = c;
1620 tok->parenlinenostack[tok->level] = tok->lineno;
1621#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 tok->level++;
1623 break;
1624 case ')':
1625 case ']':
1626 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001627#ifndef PGEN
1628 if (!tok->level) {
1629 return syntaxerror(tok, "unmatched '%c'", c);
1630 }
1631#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001632 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001633#ifndef PGEN
/* After the decrement tok->level indexes the slot that was filled when
   the matching opening bracket was seen. */
1634 int opening = tok->parenstack[tok->level];
1635 if (!((opening == '(' && c == ')') ||
1636 (opening == '[' && c == ']') ||
1637 (opening == '{' && c == '}')))
1638 {
1639 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1640 return syntaxerror(tok,
1641 "closing parenthesis '%c' does not match "
1642 "opening parenthesis '%c' on line %d",
1643 c, opening, tok->parenlinenostack[tok->level]);
1644 }
1645 else {
1646 return syntaxerror(tok,
1647 "closing parenthesis '%c' does not match "
1648 "opening parenthesis '%c'",
1649 c, opening);
1650 }
1651 }
1652#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 break;
1654 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001655
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001656 /* Punctuation character */
1657 *p_start = tok->start;
1658 *p_end = tok->cur;
1659 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001660}
1661
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001662int
1663PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1664{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 int result = tok_get(tok, p_start, p_end);
1666 if (tok->decoding_erred) {
1667 result = ERRORTOKEN;
1668 tok->done = E_DECODE;
1669 }
1670 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001671}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001672
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001673/* Get the encoding of a Python file. Check for the coding cookie and check if
1674 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001675
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001676 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1677 encoding in the first or second line of the file (in which case the encoding
1678 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001679
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001680 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1681 by the caller. */
1682
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001683char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001684PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001685{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 struct tok_state *tok;
1687 FILE *fp;
1688 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001689
Victor Stinnerdaf45552013-08-28 00:53:59 +02001690#ifndef PGEN
1691 fd = _Py_dup(fd);
1692#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001694#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 if (fd < 0) {
1696 return NULL;
1697 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001698
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 fp = fdopen(fd, "r");
1700 if (fp == NULL) {
1701 return NULL;
1702 }
1703 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1704 if (tok == NULL) {
1705 fclose(fp);
1706 return NULL;
1707 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001708#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001709 if (filename != NULL) {
1710 Py_INCREF(filename);
1711 tok->filename = filename;
1712 }
1713 else {
1714 tok->filename = PyUnicode_FromString("<string>");
1715 if (tok->filename == NULL) {
1716 fclose(fp);
1717 PyTokenizer_Free(tok);
1718 return encoding;
1719 }
1720 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001721#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 while (tok->lineno < 2 && tok->done == E_OK) {
1723 PyTokenizer_Get(tok, &p_start, &p_end);
1724 }
1725 fclose(fp);
1726 if (tok->encoding) {
1727 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1728 if (encoding)
1729 strcpy(encoding, tok->encoding);
1730 }
1731 PyTokenizer_Free(tok);
1732 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001733}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001734
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001735char *
1736PyTokenizer_FindEncoding(int fd)
1737{
1738 return PyTokenizer_FindEncodingFilename(fd, NULL);
1739}
1740
Guido van Rossum408027e1996-12-30 16:17:54 +00001741#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001742
1743void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001744tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001745{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001746 printf("%s", _PyParser_TokenNames[type]);
1747 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1748 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001749}
1750
1751#endif