blob: 0e6c1a85e035b0fad549e079ff46a3c794c30655 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080021/* Alternate tab spacing */
22#define ALTTABSIZE 1
23
/* Non-zero when C may begin an identifier: an ASCII letter, '_' or any
   value >= 128.  The parameter is parenthesized so that expression
   arguments (e.g. a conditional expression) are compared as a whole.
   C is evaluated several times, so it must have no side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000029
/* Non-zero when C may continue an identifier: an ASCII letter or digit,
   '_' or any value >= 128.  The parameter is parenthesized so that
   expression arguments are compared as a whole.  C is evaluated several
   times, so it must have no side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000036
Serhiy Storchakac6792272013-10-19 21:03:34 +030037extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000038/* Return malloc'ed string including trailing \n;
39 empty malloc'ed string for EOF;
40 NULL if interrupted */
41
Guido van Rossum4fe87291992-02-26 15:24:44 +000042/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000043#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000044
Guido van Rossum3f5da241990-12-20 15:06:42 +000045/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000046static struct tok_state *tok_new(void);
47static int tok_nextc(struct tok_state *tok);
48static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000049
Brett Cannond5ec98c2007-10-20 02:54:14 +000050
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000051/* Create and initialize a new tok_state structure */
52
53static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000054tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000055{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000056 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
57 sizeof(struct tok_state));
58 if (tok == NULL)
59 return NULL;
60 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
61 tok->done = E_OK;
62 tok->fp = NULL;
63 tok->input = NULL;
64 tok->tabsize = TABSIZE;
65 tok->indent = 0;
66 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040067
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000068 tok->atbol = 1;
69 tok->pendin = 0;
70 tok->prompt = tok->nextprompt = NULL;
71 tok->lineno = 0;
72 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000073 tok->altindstack[0] = 0;
74 tok->decoding_state = STATE_INIT;
75 tok->decoding_erred = 0;
76 tok->read_coding_spec = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000080#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +020081 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000084#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +030085
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000086 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000087}
88
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000089static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070090new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000091{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000092 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070093 if (!result) {
94 tok->done = E_NOMEM;
95 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000096 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070097 memcpy(result, s, len);
98 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000100}
101
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000102#ifdef PGEN
103
104static char *
105decoding_fgets(char *s, int size, struct tok_state *tok)
106{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000108}
109
110static int
111decoding_feof(struct tok_state *tok)
112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000114}
115
/* PGEN build: no decoding is done; return a freshly allocated copy of
   STR (NULL, with tok->done set by new_string, if allocation fails).
   EXEC_INPUT is not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
121
122#else /* PGEN */
123
124static char *
125error_ret(struct tok_state *tok) /* XXX */
126{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000127 tok->decoding_erred = 1;
128 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
129 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200130 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
131 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133}
134
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are examined, with '_' replaced
   by '-' and ASCII upper-case letters folded to lower case.  Returns the
   literal "utf-8" or "iso-8859-1" when the normalized prefix matches one
   of the known spellings (optionally followed by "-<suffix>"); otherwise
   returns S itself, so callers can detect "unchanged" by comparing
   pointers.

   Case folding is done by hand rather than with tolower(): S holds plain
   chars, which may be negative (undefined behaviour when passed to
   tolower()), and tolower() depends on the current locale. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        char c = s[i];
        if (c == '\0')
            break;
        if (c == '_')
            c = '-';
        else if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        buf[i] = c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
164
165/* Return the coding spec in S, or NULL if none is found. */
166
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700167static int
168get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700171 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 /* Coding spec must be in a comment, and that comment must be
173 * the only statement on the source code line. */
174 for (i = 0; i < size - 6; i++) {
175 if (s[i] == '#')
176 break;
177 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700178 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 }
180 for (; i < size - 6; i++) { /* XXX inefficient search */
181 const char* t = s + i;
182 if (strncmp(t, "coding", 6) == 0) {
183 const char* begin = NULL;
184 t += 6;
185 if (t[0] != ':' && t[0] != '=')
186 continue;
187 do {
188 t++;
189 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 begin = t;
192 while (Py_ISALNUM(t[0]) ||
193 t[0] == '-' || t[0] == '_' || t[0] == '.')
194 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000195
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700197 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200198 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700199 if (!r)
200 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700201 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000202 if (r != q) {
203 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700204 r = new_string(q, strlen(q), tok);
205 if (!r)
206 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700208 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200209 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000210 }
211 }
212 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000214}
215
216/* Check whether the line contains a coding spec. If it does,
217 invoke the set_readline function for the new encoding.
218 This function receives the tok_state and the new encoding.
219 Return 1 on success, 0 on failure. */
220
221static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000222check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700225 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000227
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200228 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200230 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200232 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233 if (!get_coding_spec(line, &cs, size, tok))
234 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200235 if (!cs) {
236 Py_ssize_t i;
237 for (i = 0; i < size; i++) {
238 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
239 break;
240 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
241 /* Stop checking coding spec after a line containing
242 * anything except a comment. */
243 tok->read_coding_spec = 1;
244 break;
245 }
246 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200248 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700249 tok->read_coding_spec = 1;
250 if (tok->encoding == NULL) {
251 assert(tok->decoding_state == STATE_RAW);
252 if (strcmp(cs, "utf-8") == 0) {
253 tok->encoding = cs;
254 } else {
255 r = set_readline(tok, cs);
256 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700260 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300261 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700262 "encoding problem: %s", cs);
263 PyMem_FREE(cs);
264 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 } else { /* then, compare cs with BOM */
267 r = (strcmp(tok->encoding, cs) == 0);
268 if (!r)
269 PyErr_Format(PyExc_SyntaxError,
270 "encoding problem: %s with BOM", cs);
271 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000272 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274}
275
276/* See whether the file starts with a BOM. If it does,
277 invoke the set_readline function with the new encoding.
278 Return 1 on success, 0 on failure. */
279
280static int
281check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 void unget_char(int, struct tok_state *),
283 int set_readline(struct tok_state *, const char *),
284 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 int ch1, ch2, ch3;
287 ch1 = get_char(tok);
288 tok->decoding_state = STATE_RAW;
289 if (ch1 == EOF) {
290 return 1;
291 } else if (ch1 == 0xEF) {
292 ch2 = get_char(tok);
293 if (ch2 != 0xBB) {
294 unget_char(ch2, tok);
295 unget_char(ch1, tok);
296 return 1;
297 }
298 ch3 = get_char(tok);
299 if (ch3 != 0xBF) {
300 unget_char(ch3, tok);
301 unget_char(ch2, tok);
302 unget_char(ch1, tok);
303 return 1;
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 /* Disable support for UTF-16 BOMs until a decision
307 is made whether this needs to be supported. */
308 } else if (ch1 == 0xFE) {
309 ch2 = get_char(tok);
310 if (ch2 != 0xFF) {
311 unget_char(ch2, tok);
312 unget_char(ch1, tok);
313 return 1;
314 }
315 if (!set_readline(tok, "utf-16-be"))
316 return 0;
317 tok->decoding_state = STATE_NORMAL;
318 } else if (ch1 == 0xFF) {
319 ch2 = get_char(tok);
320 if (ch2 != 0xFE) {
321 unget_char(ch2, tok);
322 unget_char(ch1, tok);
323 return 1;
324 }
325 if (!set_readline(tok, "utf-16-le"))
326 return 0;
327 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000328#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 } else {
330 unget_char(ch1, tok);
331 return 1;
332 }
333 if (tok->encoding != NULL)
334 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700335 tok->encoding = new_string("utf-8", 5, tok);
336 if (!tok->encoding)
337 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 /* No need to set_readline: input is already utf-8 */
339 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340}
341
342/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000344
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000345 On entry, tok->decoding_buffer will be one of:
346 1) NULL: need to call tok->decoding_readline to get a new line
347 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000349 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 (in the s buffer) to copy entire contents of the line read
351 by tok->decoding_readline. tok->decoding_buffer has the overflow.
352 In this case, fp_readl is called in a loop (with an expanded buffer)
353 until the buffer ends with a '\n' (or until the end of the file is
354 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356
357static char *
358fp_readl(char *s, int size, struct tok_state *tok)
359{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 PyObject* bufobj;
361 const char *buf;
362 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Ask for one less byte so we can terminate it */
365 assert(size > 0);
366 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000368 if (tok->decoding_buffer) {
369 bufobj = tok->decoding_buffer;
370 Py_INCREF(bufobj);
371 }
372 else
373 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100374 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 if (bufobj == NULL)
376 goto error;
377 }
378 if (PyUnicode_CheckExact(bufobj))
379 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200380 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 if (buf == NULL) {
382 goto error;
383 }
384 }
385 else
386 {
387 buf = PyByteArray_AsString(bufobj);
388 if (buf == NULL) {
389 goto error;
390 }
391 buflen = PyByteArray_GET_SIZE(bufobj);
392 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 Py_XDECREF(tok->decoding_buffer);
395 if (buflen > size) {
396 /* Too many chars, the rest goes into tok->decoding_buffer */
397 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
398 buflen-size);
399 if (tok->decoding_buffer == NULL)
400 goto error;
401 buflen = size;
402 }
403 else
404 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000405
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 memcpy(s, buf, buflen);
407 s[buflen] = '\0';
408 if (buflen == 0) /* EOF */
409 s = NULL;
410 Py_DECREF(bufobj);
411 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000412
413error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000414 Py_XDECREF(bufobj);
415 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000416}
417
418/* Set the readline function for TOK to a StreamReader's
419 readline function. The StreamReader is named ENC.
420
421 This function is called from check_bom and check_coding_spec.
422
423 ENC is usually identical to the future value of tok->encoding,
424 except for the (currently unsupported) case of UTF-16.
425
426 Return 1 on success, 0 on failure. */
427
428static int
429fp_setreadl(struct tok_state *tok, const char* enc)
430{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700431 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200432 _Py_IDENTIFIER(open);
433 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000434 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200435 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
Victor Stinner22a351a2010-10-14 12:04:34 +0000437 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200438 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100439 * position of tok->fp. If tok->fp was opened in text mode on Windows,
440 * its file position counts CRLF as one char and can't be directly mapped
441 * to the file offset for fd. Instead we step back one byte and read to
442 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200443 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100444 if (pos == -1 ||
445 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000446 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700447 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000448 }
449
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700450 io = PyImport_ImportModuleNoBlock("io");
451 if (io == NULL)
452 return 0;
453
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200454 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000455 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700456 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000457 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700458 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200460 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 Py_DECREF(stream);
462 if (readline == NULL)
463 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300464 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700465
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100466 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100467 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700468 if (bufobj == NULL)
469 return 0;
470 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100471 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700473 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Fetch the next byte from TOK. */
477
478static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000480}
481
482/* Unfetch the last byte back into TOK. */
483
484static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000485 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486}
487
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so for a sequence cut short by the end of the string the scan
   stops at the terminating NUL (which is not a continuation byte) and
   never reads past it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    for (; expected; expected--) {
        s++;
        if (*s < 0x80 || *s >= 0xC0)
            return 0;
    }
    return length;
}
515
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516/* Read a line of input from TOK. Determine encoding
517 if necessary. */
518
519static char *
520decoding_fgets(char *s, int size, struct tok_state *tok)
521{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 char *line = NULL;
523 int badchar = 0;
524 for (;;) {
525 if (tok->decoding_state == STATE_NORMAL) {
526 /* We already have a codec associated with
527 this input. */
528 line = fp_readl(s, size, tok);
529 break;
530 } else if (tok->decoding_state == STATE_RAW) {
531 /* We want a 'raw' read. */
532 line = Py_UniversalNewlineFgets(s, size,
533 tok->fp, NULL);
534 break;
535 } else {
536 /* We have not yet determined the encoding.
537 If an encoding is found, use the file-pointer
538 reader functions from now on. */
539 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
540 return error_ret(tok);
541 assert(tok->decoding_state != STATE_INIT);
542 }
543 }
544 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
545 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
546 return error_ret(tok);
547 }
548 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550 /* The default encoding is UTF-8, so make sure we don't have any
551 non-UTF-8 sequences in it. */
552 if (line && !tok->encoding) {
553 unsigned char *c;
554 int length;
555 for (c = (unsigned char *)line; *c; c += length)
556 if (!(length = valid_utf8(c))) {
557 badchar = *c;
558 break;
559 }
560 }
561 if (badchar) {
562 /* Need to add 1 to the line number, since this line
563 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200564 PyErr_Format(PyExc_SyntaxError,
565 "Non-UTF-8 code starting with '\\x%.2x' "
566 "in file %U on line %i, "
567 "but no encoding declared; "
568 "see http://python.org/dev/peps/pep-0263/ for details",
569 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000570 return error_ret(tok);
571 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000573 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574}
575
576static int
577decoding_feof(struct tok_state *tok)
578{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 if (tok->decoding_state != STATE_NORMAL) {
580 return feof(tok->fp);
581 } else {
582 PyObject* buf = tok->decoding_buffer;
583 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100584 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000585 if (buf == NULL) {
586 error_ret(tok);
587 return 1;
588 } else {
589 tok->decoding_buffer = buf;
590 }
591 }
592 return PyObject_Length(buf) == 0;
593 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594}
595
596/* Fetch a byte from TOK, using the string buffer. */
597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000598static int
599buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000600 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Unfetch a byte from TOK, using the string buffer. */
604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000605static void
606buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 tok->str--;
608 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609}
610
611/* Set the readline function for TOK to ENC. For the string-based
612 tokenizer, this means to just record the encoding. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 tok->enc = enc;
617 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618}
619
620/* Return a UTF-8 encoding Python string object from the
621 C byte string STR, which is encoded with ENC. */
622
623static PyObject *
624translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 PyObject *utf8;
626 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
627 if (buf == NULL)
628 return NULL;
629 utf8 = PyUnicode_AsUTF8String(buf);
630 Py_DECREF(buf);
631 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000634
635static char *
636translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200637 int skip_next_lf = 0;
638 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 char *buf, *current;
640 char c = '\0';
641 buf = PyMem_MALLOC(needed_length);
642 if (buf == NULL) {
643 tok->done = E_NOMEM;
644 return NULL;
645 }
646 for (current = buf; *s; s++, current++) {
647 c = *s;
648 if (skip_next_lf) {
649 skip_next_lf = 0;
650 if (c == '\n') {
651 c = *++s;
652 if (!c)
653 break;
654 }
655 }
656 if (c == '\r') {
657 skip_next_lf = 1;
658 c = '\n';
659 }
660 *current = c;
661 }
662 /* If this is exec input, add a newline to the end of the string if
663 there isn't one already. */
664 if (exec_input && c != '\n') {
665 *current = '\n';
666 current++;
667 }
668 *current = '\0';
669 final_length = current - buf + 1;
670 if (final_length < needed_length && final_length)
671 /* should never fail */
672 buf = PyMem_REALLOC(buf, final_length);
673 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000674}
675
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676/* Decode a byte string STR for use as the buffer of TOK.
677 Look for encoding declarations inside STR, and record them
678 inside TOK. */
679
680static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000681decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 PyObject* utf8 = NULL;
684 const char *str;
685 const char *s;
686 const char *newl[2] = {NULL, NULL};
687 int lineno = 0;
688 tok->input = str = translate_newlines(input, single, tok);
689 if (str == NULL)
690 return NULL;
691 tok->enc = NULL;
692 tok->str = str;
693 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
694 return error_ret(tok);
695 str = tok->str; /* string after BOM if any */
696 assert(str);
697 if (tok->enc != NULL) {
698 utf8 = translate_into_utf8(str, tok->enc);
699 if (utf8 == NULL)
700 return error_ret(tok);
701 str = PyBytes_AsString(utf8);
702 }
703 for (s = str;; s++) {
704 if (*s == '\0') break;
705 else if (*s == '\n') {
706 assert(lineno < 2);
707 newl[lineno] = s;
708 lineno++;
709 if (lineno == 2) break;
710 }
711 }
712 tok->enc = NULL;
713 /* need to check line 1 and 2 separately since check_coding_spec
714 assumes a single line as input */
715 if (newl[0]) {
716 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
717 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200718 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
720 tok, buf_setreadl))
721 return error_ret(tok);
722 }
723 }
724 if (tok->enc != NULL) {
725 assert(utf8 == NULL);
726 utf8 = translate_into_utf8(str, tok->enc);
727 if (utf8 == NULL)
728 return error_ret(tok);
729 str = PyBytes_AS_STRING(utf8);
730 }
731 assert(tok->decoding_buffer == NULL);
732 tok->decoding_buffer = utf8; /* CAUTION */
733 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734}
735
736#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737
738/* Set up tokenizer for string */
739
740struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000741PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 struct tok_state *tok = tok_new();
744 if (tok == NULL)
745 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300746 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 if (str == NULL) {
748 PyTokenizer_Free(tok);
749 return NULL;
750 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000751
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 /* XXX: constify members. */
753 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
754 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755}
756
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000757struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000758PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000759{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 struct tok_state *tok = tok_new();
761 if (tok == NULL)
762 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000765#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 if (str == NULL) {
767 PyTokenizer_Free(tok);
768 return NULL;
769 }
770 tok->decoding_state = STATE_RAW;
771 tok->read_coding_spec = 1;
772 tok->enc = NULL;
773 tok->str = str;
774 tok->encoding = (char *)PyMem_MALLOC(6);
775 if (!tok->encoding) {
776 PyTokenizer_Free(tok);
777 return NULL;
778 }
779 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000780
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 /* XXX: constify members. */
782 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
783 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000784}
785
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000786/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787
788struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300789PyTokenizer_FromFile(FILE *fp, const char* enc,
790 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 struct tok_state *tok = tok_new();
793 if (tok == NULL)
794 return NULL;
795 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
796 PyTokenizer_Free(tok);
797 return NULL;
798 }
799 tok->cur = tok->inp = tok->buf;
800 tok->end = tok->buf + BUFSIZ;
801 tok->fp = fp;
802 tok->prompt = ps1;
803 tok->nextprompt = ps2;
804 if (enc != NULL) {
805 /* Must copy encoding declaration since it
806 gets copied into the parse tree. */
807 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
808 if (!tok->encoding) {
809 PyTokenizer_Free(tok);
810 return NULL;
811 }
812 strcpy(tok->encoding, enc);
813 tok->decoding_state = STATE_NORMAL;
814 }
815 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816}
817
818
819/* Free a tok_state structure */
820
821void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000822PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 if (tok->encoding != NULL)
825 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000826#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 Py_XDECREF(tok->decoding_readline);
828 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200829 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000830#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 if (tok->fp != NULL && tok->buf != NULL)
832 PyMem_FREE(tok->buf);
833 if (tok->input)
834 PyMem_FREE((char *)tok->input);
835 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836}
837
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838/* Get next char, updating state; error code goes into tok->done */
839
840static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200841tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 for (;;) {
844 if (tok->cur != tok->inp) {
845 return Py_CHARMASK(*tok->cur++); /* Fast path */
846 }
847 if (tok->done != E_OK)
848 return EOF;
849 if (tok->fp == NULL) {
850 char *end = strchr(tok->inp, '\n');
851 if (end != NULL)
852 end++;
853 else {
854 end = strchr(tok->inp, '\0');
855 if (end == tok->inp) {
856 tok->done = E_EOF;
857 return EOF;
858 }
859 }
860 if (tok->start == NULL)
861 tok->buf = tok->cur;
862 tok->line_start = tok->cur;
863 tok->lineno++;
864 tok->inp = end;
865 return Py_CHARMASK(*tok->cur++);
866 }
867 if (tok->prompt != NULL) {
868 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000869#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000870 if (newtok != NULL) {
871 char *translated = translate_newlines(newtok, 0, tok);
872 PyMem_FREE(newtok);
873 if (translated == NULL)
874 return EOF;
875 newtok = translated;
876 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000877 if (tok->encoding && newtok && *newtok) {
878 /* Recode to UTF-8 */
879 Py_ssize_t buflen;
880 const char* buf;
881 PyObject *u = translate_into_utf8(newtok, tok->encoding);
882 PyMem_FREE(newtok);
883 if (!u) {
884 tok->done = E_DECODE;
885 return EOF;
886 }
887 buflen = PyBytes_GET_SIZE(u);
888 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700890 if (newtok == NULL) {
891 Py_DECREF(u);
892 tok->done = E_NOMEM;
893 return EOF;
894 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000895 strcpy(newtok, buf);
896 Py_DECREF(u);
897 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000898#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 if (tok->nextprompt != NULL)
900 tok->prompt = tok->nextprompt;
901 if (newtok == NULL)
902 tok->done = E_INTR;
903 else if (*newtok == '\0') {
904 PyMem_FREE(newtok);
905 tok->done = E_EOF;
906 }
907 else if (tok->start != NULL) {
908 size_t start = tok->start - tok->buf;
909 size_t oldlen = tok->cur - tok->buf;
910 size_t newlen = oldlen + strlen(newtok);
911 char *buf = tok->buf;
912 buf = (char *)PyMem_REALLOC(buf, newlen+1);
913 tok->lineno++;
914 if (buf == NULL) {
915 PyMem_FREE(tok->buf);
916 tok->buf = NULL;
917 PyMem_FREE(newtok);
918 tok->done = E_NOMEM;
919 return EOF;
920 }
921 tok->buf = buf;
922 tok->cur = tok->buf + oldlen;
923 tok->line_start = tok->cur;
924 strcpy(tok->buf + oldlen, newtok);
925 PyMem_FREE(newtok);
926 tok->inp = tok->buf + newlen;
927 tok->end = tok->inp + 1;
928 tok->start = tok->buf + start;
929 }
930 else {
931 tok->lineno++;
932 if (tok->buf != NULL)
933 PyMem_FREE(tok->buf);
934 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 tok->cur = tok->buf;
936 tok->line_start = tok->buf;
937 tok->inp = strchr(tok->buf, '\0');
938 tok->end = tok->inp + 1;
939 }
940 }
941 else {
942 int done = 0;
943 Py_ssize_t cur = 0;
944 char *pt;
945 if (tok->start == NULL) {
946 if (tok->buf == NULL) {
947 tok->buf = (char *)
948 PyMem_MALLOC(BUFSIZ);
949 if (tok->buf == NULL) {
950 tok->done = E_NOMEM;
951 return EOF;
952 }
953 tok->end = tok->buf + BUFSIZ;
954 }
955 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
956 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200957 if (!tok->decoding_erred)
958 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000959 done = 1;
960 }
961 else {
962 tok->done = E_OK;
963 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700964 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000965 }
966 }
967 else {
968 cur = tok->cur - tok->buf;
969 if (decoding_feof(tok)) {
970 tok->done = E_EOF;
971 done = 1;
972 }
973 else
974 tok->done = E_OK;
975 }
976 tok->lineno++;
977 /* Read until '\n' or EOF */
978 while (!done) {
979 Py_ssize_t curstart = tok->start == NULL ? -1 :
980 tok->start - tok->buf;
981 Py_ssize_t curvalid = tok->inp - tok->buf;
982 Py_ssize_t newsize = curvalid + BUFSIZ;
983 char *newbuf = tok->buf;
984 newbuf = (char *)PyMem_REALLOC(newbuf,
985 newsize);
986 if (newbuf == NULL) {
987 tok->done = E_NOMEM;
988 tok->cur = tok->inp;
989 return EOF;
990 }
991 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200992 tok->cur = tok->buf + cur;
993 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000994 tok->inp = tok->buf + curvalid;
995 tok->end = tok->buf + newsize;
996 tok->start = curstart < 0 ? NULL :
997 tok->buf + curstart;
998 if (decoding_fgets(tok->inp,
999 (int)(tok->end - tok->inp),
1000 tok) == NULL) {
1001 /* Break out early on decoding
1002 errors, as tok->buf will be NULL
1003 */
1004 if (tok->decoding_erred)
1005 return EOF;
1006 /* Last line does not end in \n,
1007 fake one */
1008 strcpy(tok->inp, "\n");
1009 }
1010 tok->inp = strchr(tok->inp, '\0');
1011 done = tok->inp[-1] == '\n';
1012 }
1013 if (tok->buf != NULL) {
1014 tok->cur = tok->buf + cur;
1015 tok->line_start = tok->cur;
1016 /* replace "\r\n" with "\n" */
1017 /* For Mac leave the \r, giving a syntax error */
1018 pt = tok->inp - 2;
1019 if (pt >= tok->buf && *pt == '\r') {
1020 *pt++ = '\n';
1021 *pt = '\0';
1022 tok->inp = pt;
1023 }
1024 }
1025 }
1026 if (tok->done != E_OK) {
1027 if (tok->prompt != NULL)
1028 PySys_WriteStderr("\n");
1029 tok->cur = tok->inp;
1030 return EOF;
1031 }
1032 }
1033 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001034}
1035
1036
1037/* Back-up one character */
1038
1039static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001040tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 if (c != EOF) {
1043 if (--tok->cur < tok->buf)
1044 Py_FatalError("tok_backup: beginning of buffer");
1045 if (*tok->cur != c)
1046 *tok->cur = c;
1047 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048}
1049
1050
Guido van Rossum926f13a1998-04-09 21:38:06 +00001051static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001052syntaxerror(struct tok_state *tok, const char *format, ...)
1053{
1054#ifndef PGEN
1055 va_list vargs;
1056#ifdef HAVE_STDARG_PROTOTYPES
1057 va_start(vargs, format);
1058#else
1059 va_start(vargs);
1060#endif
1061 PyErr_FormatV(PyExc_SyntaxError, format, vargs);
1062 va_end(vargs);
1063 PyErr_SyntaxLocationObject(tok->filename,
1064 tok->lineno,
Victor Stinnerc8846162018-07-21 03:36:06 +02001065 (int)(tok->cur - tok->line_start));
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001066 tok->done = E_ERROR;
1067#else
1068 tok->done = E_TOKEN;
1069#endif
1070 return ERRORTOKEN;
1071}
1072
1073static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001074indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001075{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001076 tok->done = E_TABSPACE;
1077 tok->cur = tok->inp;
1078 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001079}
1080
Martin v. Löwis47383402007-08-15 07:32:56 +00001081#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001082#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001083#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084/* Verify that the identifier follows PEP 3131.
1085 All identifier strings are guaranteed to be "ready" unicode objects.
1086 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001087static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001088verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001089{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 PyObject *s;
1091 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001092 if (tok->decoding_erred)
1093 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001095 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001096 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1097 PyErr_Clear();
1098 tok->done = E_IDENTIFIER;
1099 } else {
1100 tok->done = E_ERROR;
1101 }
1102 return 0;
1103 }
1104 result = PyUnicode_IsIdentifier(s);
1105 Py_DECREF(s);
1106 if (result == 0)
1107 tok->done = E_IDENTIFIER;
1108 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001109}
1110#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001111
/* Consume the remainder of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character after the run (already consumed; callers
   back it up as needed).  If an underscore is not followed by a digit,
   the offending character is backed up, a syntax error is reported and
   0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by another digit */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1133
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001134/* Get next token, after space stripping etc. */
1135
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001136static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001137tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001138{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001139 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001141
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 tok->start = NULL;
1145 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 /* Get indentation level */
1148 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001149 int col = 0;
1150 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 tok->atbol = 0;
1152 for (;;) {
1153 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001154 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001156 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001158 col = (col / tok->tabsize + 1) * tok->tabsize;
1159 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 }
Brett Cannona721aba2016-09-09 14:57:09 -07001161 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001163 }
1164 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001166 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167 }
1168 tok_backup(tok, c);
1169 if (c == '#' || c == '\n') {
1170 /* Lines with only whitespace and/or comments
1171 shouldn't affect the indentation and are
1172 not passed to the parser as NEWLINE tokens,
1173 except *totally* empty lines in interactive
1174 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001175 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001176 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001177 }
1178 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001180 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 /* We can't jump back right here since we still
1182 may need to skip to the end of a comment */
1183 }
1184 if (!blankline && tok->level == 0) {
1185 if (col == tok->indstack[tok->indent]) {
1186 /* No change */
1187 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001188 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 }
1190 }
1191 else if (col > tok->indstack[tok->indent]) {
1192 /* Indent -- always one */
1193 if (tok->indent+1 >= MAXINDENT) {
1194 tok->done = E_TOODEEP;
1195 tok->cur = tok->inp;
1196 return ERRORTOKEN;
1197 }
1198 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001199 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 }
1201 tok->pendin++;
1202 tok->indstack[++tok->indent] = col;
1203 tok->altindstack[tok->indent] = altcol;
1204 }
1205 else /* col < tok->indstack[tok->indent] */ {
1206 /* Dedent -- any number, must be consistent */
1207 while (tok->indent > 0 &&
1208 col < tok->indstack[tok->indent]) {
1209 tok->pendin--;
1210 tok->indent--;
1211 }
1212 if (col != tok->indstack[tok->indent]) {
1213 tok->done = E_DEDENT;
1214 tok->cur = tok->inp;
1215 return ERRORTOKEN;
1216 }
1217 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001218 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 }
1220 }
1221 }
1222 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001223
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001225
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 /* Return pending indents/dedents */
1227 if (tok->pendin != 0) {
1228 if (tok->pendin < 0) {
1229 tok->pendin++;
1230 return DEDENT;
1231 }
1232 else {
1233 tok->pendin--;
1234 return INDENT;
1235 }
1236 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001237
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001238 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 tok->start = NULL;
1240 /* Skip spaces */
1241 do {
1242 c = tok_nextc(tok);
1243 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 /* Set start of current token */
1246 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001247
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 /* Skip comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001249 if (c == '#') {
1250 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001252 }
1253 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001255 /* Check for EOF and errors now */
1256 if (c == EOF) {
1257 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1258 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 /* Identifier (most frequent token!) */
1261 nonascii = 0;
1262 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001263 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001264 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001265 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001266 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001267 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001268 /* Since this is a backwards compatibility support literal we don't
1269 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001270 else if (!(saw_b || saw_u || saw_r || saw_f)
1271 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001272 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001273 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001274 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001275 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001276 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001277 }
1278 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001279 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001280 }
1281 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001282 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001283 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001285 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001287 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 }
1289 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001290 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001292 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 c = tok_nextc(tok);
1294 }
1295 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001296 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001298 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 *p_start = tok->start;
1300 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001301
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 return NAME;
1303 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001304
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 /* Newline */
1306 if (c == '\n') {
1307 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001308 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001310 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 *p_start = tok->start;
1312 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1313 tok->cont_line = 0;
1314 return NEWLINE;
1315 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001316
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 /* Period or number starting with period? */
1318 if (c == '.') {
1319 c = tok_nextc(tok);
1320 if (isdigit(c)) {
1321 goto fraction;
1322 } else if (c == '.') {
1323 c = tok_nextc(tok);
1324 if (c == '.') {
1325 *p_start = tok->start;
1326 *p_end = tok->cur;
1327 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001328 }
1329 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 tok_backup(tok, c);
1331 }
1332 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001333 }
1334 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 tok_backup(tok, c);
1336 }
1337 *p_start = tok->start;
1338 *p_end = tok->cur;
1339 return DOT;
1340 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001341
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 /* Number */
1343 if (isdigit(c)) {
1344 if (c == '0') {
1345 /* Hex, octal or binary -- maybe. */
1346 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001347 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 /* Hex */
1349 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001351 if (c == '_') {
1352 c = tok_nextc(tok);
1353 }
1354 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001355 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001356 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001357 }
1358 do {
1359 c = tok_nextc(tok);
1360 } while (isxdigit(c));
1361 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 }
1363 else if (c == 'o' || c == 'O') {
1364 /* Octal */
1365 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001367 if (c == '_') {
1368 c = tok_nextc(tok);
1369 }
1370 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001371 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001372 if (isdigit(c)) {
1373 return syntaxerror(tok,
1374 "invalid digit '%c' in octal literal", c);
1375 }
1376 else {
1377 return syntaxerror(tok, "invalid octal literal");
1378 }
Brett Cannona721aba2016-09-09 14:57:09 -07001379 }
1380 do {
1381 c = tok_nextc(tok);
1382 } while ('0' <= c && c < '8');
1383 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001384 if (isdigit(c)) {
1385 return syntaxerror(tok,
1386 "invalid digit '%c' in octal literal", c);
1387 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 }
1389 else if (c == 'b' || c == 'B') {
1390 /* Binary */
1391 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001393 if (c == '_') {
1394 c = tok_nextc(tok);
1395 }
1396 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001397 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001398 if (isdigit(c)) {
1399 return syntaxerror(tok,
1400 "invalid digit '%c' in binary literal", c);
1401 }
1402 else {
1403 return syntaxerror(tok, "invalid binary literal");
1404 }
Brett Cannona721aba2016-09-09 14:57:09 -07001405 }
1406 do {
1407 c = tok_nextc(tok);
1408 } while (c == '0' || c == '1');
1409 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001410 if (isdigit(c)) {
1411 return syntaxerror(tok,
1412 "invalid digit '%c' in binary literal", c);
1413 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 }
1415 else {
1416 int nonzero = 0;
1417 /* maybe old-style octal; c is first char of it */
1418 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001419 while (1) {
1420 if (c == '_') {
1421 c = tok_nextc(tok);
1422 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001423 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001424 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001425 }
1426 }
1427 if (c != '0') {
1428 break;
1429 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 c = tok_nextc(tok);
1431 }
Brett Cannona721aba2016-09-09 14:57:09 -07001432 if (isdigit(c)) {
1433 nonzero = 1;
1434 c = tok_decimal_tail(tok);
1435 if (c == 0) {
1436 return ERRORTOKEN;
1437 }
1438 }
1439 if (c == '.') {
1440 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001442 }
1443 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001445 }
1446 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001448 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001450 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001452 return syntaxerror(tok,
1453 "leading zeros in decimal integer "
1454 "literals are not permitted; "
1455 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 }
1457 }
1458 }
1459 else {
1460 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001461 c = tok_decimal_tail(tok);
1462 if (c == 0) {
1463 return ERRORTOKEN;
1464 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 {
1466 /* Accept floating point numbers. */
1467 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001468 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 fraction:
1470 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001471 if (isdigit(c)) {
1472 c = tok_decimal_tail(tok);
1473 if (c == 0) {
1474 return ERRORTOKEN;
1475 }
1476 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 }
1478 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001479 int e;
1480 exponent:
1481 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 /* Exponent part */
1483 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001484 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001486 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001487 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001488 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001489 }
1490 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001492 tok_backup(tok, e);
1493 *p_start = tok->start;
1494 *p_end = tok->cur;
1495 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 }
Brett Cannona721aba2016-09-09 14:57:09 -07001497 c = tok_decimal_tail(tok);
1498 if (c == 0) {
1499 return ERRORTOKEN;
1500 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 }
Brett Cannona721aba2016-09-09 14:57:09 -07001502 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001503 /* Imaginary part */
1504 imaginary:
1505 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001506 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 }
1508 }
1509 tok_backup(tok, c);
1510 *p_start = tok->start;
1511 *p_end = tok->cur;
1512 return NUMBER;
1513 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001514
1515 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001516 /* String */
1517 if (c == '\'' || c == '"') {
1518 int quote = c;
1519 int quote_size = 1; /* 1 or 3 */
1520 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001521
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001522 /* Find the quote size and start of string */
1523 c = tok_nextc(tok);
1524 if (c == quote) {
1525 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001526 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001528 }
1529 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001530 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001531 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 }
Brett Cannona721aba2016-09-09 14:57:09 -07001533 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001535 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001536
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 /* Get rest of string */
1538 while (end_quote_size != quote_size) {
1539 c = tok_nextc(tok);
1540 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001541 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001543 }
1544 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001546 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001547 tok->cur = tok->inp;
1548 return ERRORTOKEN;
1549 }
1550 if (quote_size == 1 && c == '\n') {
1551 tok->done = E_EOLS;
1552 tok->cur = tok->inp;
1553 return ERRORTOKEN;
1554 }
Brett Cannona721aba2016-09-09 14:57:09 -07001555 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001556 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001557 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 else {
1559 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001560 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001561 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001562 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 }
1564 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001565
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001566 *p_start = tok->start;
1567 *p_end = tok->cur;
1568 return STRING;
1569 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001570
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001571 /* Line continuation */
1572 if (c == '\\') {
1573 c = tok_nextc(tok);
1574 if (c != '\n') {
1575 tok->done = E_LINECONT;
1576 tok->cur = tok->inp;
1577 return ERRORTOKEN;
1578 }
1579 tok->cont_line = 1;
1580 goto again; /* Read next line */
1581 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001582
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001583 /* Check for two-character token */
1584 {
1585 int c2 = tok_nextc(tok);
1586 int token = PyToken_TwoChars(c, c2);
1587 if (token != OP) {
1588 int c3 = tok_nextc(tok);
1589 int token3 = PyToken_ThreeChars(c, c2, c3);
1590 if (token3 != OP) {
1591 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001592 }
1593 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 tok_backup(tok, c3);
1595 }
1596 *p_start = tok->start;
1597 *p_end = tok->cur;
1598 return token;
1599 }
1600 tok_backup(tok, c2);
1601 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001602
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 /* Keep track of parentheses nesting level */
1604 switch (c) {
1605 case '(':
1606 case '[':
1607 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001608#ifndef PGEN
1609 if (tok->level >= MAXLEVEL) {
1610 return syntaxerror(tok, "too many nested parentheses");
1611 }
1612 tok->parenstack[tok->level] = c;
1613 tok->parenlinenostack[tok->level] = tok->lineno;
1614#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001615 tok->level++;
1616 break;
1617 case ')':
1618 case ']':
1619 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001620#ifndef PGEN
1621 if (!tok->level) {
1622 return syntaxerror(tok, "unmatched '%c'", c);
1623 }
1624#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001626#ifndef PGEN
1627 int opening = tok->parenstack[tok->level];
1628 if (!((opening == '(' && c == ')') ||
1629 (opening == '[' && c == ']') ||
1630 (opening == '{' && c == '}')))
1631 {
1632 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1633 return syntaxerror(tok,
1634 "closing parenthesis '%c' does not match "
1635 "opening parenthesis '%c' on line %d",
1636 c, opening, tok->parenlinenostack[tok->level]);
1637 }
1638 else {
1639 return syntaxerror(tok,
1640 "closing parenthesis '%c' does not match "
1641 "opening parenthesis '%c'",
1642 c, opening);
1643 }
1644 }
1645#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 break;
1647 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001648
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001649 /* Punctuation character */
1650 *p_start = tok->start;
1651 *p_end = tok->cur;
1652 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001653}
1654
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001655int
1656PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1657{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 int result = tok_get(tok, p_start, p_end);
1659 if (tok->decoding_erred) {
1660 result = ERRORTOKEN;
1661 tok->done = E_DECODE;
1662 }
1663 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001664}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001665
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001666/* Get the encoding of a Python file. Check for the coding cookie and check if
1667 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001668
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001669 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1670 encoding in the first or second line of the file (in which case the encoding
1671 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001672
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001673 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1674 by the caller. */
1675
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001676char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001677PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001678{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 struct tok_state *tok;
1680 FILE *fp;
1681 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001682
Victor Stinnerdaf45552013-08-28 00:53:59 +02001683#ifndef PGEN
1684 fd = _Py_dup(fd);
1685#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001687#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 if (fd < 0) {
1689 return NULL;
1690 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001691
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 fp = fdopen(fd, "r");
1693 if (fp == NULL) {
1694 return NULL;
1695 }
1696 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1697 if (tok == NULL) {
1698 fclose(fp);
1699 return NULL;
1700 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001701#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001702 if (filename != NULL) {
1703 Py_INCREF(filename);
1704 tok->filename = filename;
1705 }
1706 else {
1707 tok->filename = PyUnicode_FromString("<string>");
1708 if (tok->filename == NULL) {
1709 fclose(fp);
1710 PyTokenizer_Free(tok);
1711 return encoding;
1712 }
1713 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001714#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 while (tok->lineno < 2 && tok->done == E_OK) {
1716 PyTokenizer_Get(tok, &p_start, &p_end);
1717 }
1718 fclose(fp);
1719 if (tok->encoding) {
1720 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1721 if (encoding)
1722 strcpy(encoding, tok->encoding);
1723 }
1724 PyTokenizer_Free(tok);
1725 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001726}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001727
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001728char *
1729PyTokenizer_FindEncoding(int fd)
1730{
1731 return PyTokenizer_FindEncodingFilename(fd, NULL);
1732}
1733
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout.  For NAME, NUMBER,
   STRING and OP tokens the token text, i.e. the bytes in [start, end), is
   appended in parentheses.  No newline is written. */
void
tok_dump(int type, char *start, char *end)
{
    int has_text;

    printf("%s", _PyParser_TokenNames[type]);

    has_text = (type == NAME || type == NUMBER
                || type == STRING || type == OP);
    if (!has_text) {
        return;
    }

    /* The text is not NUL-terminated at `end`, so bound it via precision. */
    printf("(%.*s)", (int)(end - start), start);
}

#endif