blob: 1ded9ade377156a0f9602da688a098190cb726a2 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080021/* Alternate tab spacing */
22#define ALTTABSIZE 1
23
/* True when C may begin an identifier: an ASCII letter, '_', or any value
   >= 128.  The argument is parenthesised so that compound expressions
   expand correctly; it is still evaluated more than once, so do not pass
   an expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000029
/* True when C may continue an identifier: an ASCII letter or digit, '_',
   or any value >= 128.  The argument is parenthesised so that compound
   expressions expand correctly; it is still evaluated more than once, so
   do not pass an expression with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000036
Serhiy Storchakac6792272013-10-19 21:03:34 +030037extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000038/* Return malloc'ed string including trailing \n;
39 empty malloc'ed string for EOF;
40 NULL if interrupted */
41
Guido van Rossum4fe87291992-02-26 15:24:44 +000042/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000043#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000044
Guido van Rossum3f5da241990-12-20 15:06:42 +000045/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000046static struct tok_state *tok_new(void);
47static int tok_nextc(struct tok_state *tok);
48static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000049
Brett Cannond5ec98c2007-10-20 02:54:14 +000050
Guido van Rossumdcfcd142019-01-31 03:40:27 -080051/* Spaces in this constant are treated as "zero or more spaces or tabs" when
52 tokenizing. */
53static const char* type_comment_prefix = "# type: ";
54
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000055/* Create and initialize a new tok_state structure */
56
57static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000058tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000059{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000060 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
61 sizeof(struct tok_state));
62 if (tok == NULL)
63 return NULL;
64 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
65 tok->done = E_OK;
66 tok->fp = NULL;
67 tok->input = NULL;
68 tok->tabsize = TABSIZE;
69 tok->indent = 0;
70 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -040071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000072 tok->atbol = 1;
73 tok->pendin = 0;
74 tok->prompt = tok->nextprompt = NULL;
75 tok->lineno = 0;
76 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 tok->altindstack[0] = 0;
78 tok->decoding_state = STATE_INIT;
79 tok->decoding_erred = 0;
80 tok->read_coding_spec = 0;
81 tok->enc = NULL;
82 tok->encoding = NULL;
83 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000084#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +020085 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000086 tok->decoding_readline = NULL;
87 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000088#endif
Guido van Rossumdcfcd142019-01-31 03:40:27 -080089 tok->type_comments = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +030090
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000092}
93
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000094static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070095new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +000096{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000097 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -070098 if (!result) {
99 tok->done = E_NOMEM;
100 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700102 memcpy(result, s, len);
103 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000105}
106
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000107#ifdef PGEN
108
109static char *
110decoding_fgets(char *s, int size, struct tok_state *tok)
111{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000113}
114
115static int
116decoding_feof(struct tok_state *tok)
117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000119}
120
/* PGEN build: no decoding is done; return a newly allocated copy of STR
   (NULL, with TOK->done set by new_string, if allocation fails).
   EXEC_INPUT is unused in this build. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
126
127#else /* PGEN */
128
129static char *
130error_ret(struct tok_state *tok) /* XXX */
131{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 tok->decoding_erred = 1;
133 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
134 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200135 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
136 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138}
139
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140
/* Map common spellings of UTF-8 and Latin-1 to a canonical name.

   The first 12 characters of S are lower-cased with '_' replaced by '-'
   and compared against the known aliases.  Returns the string literal
   "utf-8" or "iso-8859-1" on a match; otherwise returns S itself, so
   callers can detect "no change" by pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as unsigned
           char (or EOF); a plain char may be negative. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
169
170/* Return the coding spec in S, or NULL if none is found. */
171
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700172static int
173get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700176 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 /* Coding spec must be in a comment, and that comment must be
178 * the only statement on the source code line. */
179 for (i = 0; i < size - 6; i++) {
180 if (s[i] == '#')
181 break;
182 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700183 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 }
185 for (; i < size - 6; i++) { /* XXX inefficient search */
186 const char* t = s + i;
187 if (strncmp(t, "coding", 6) == 0) {
188 const char* begin = NULL;
189 t += 6;
190 if (t[0] != ':' && t[0] != '=')
191 continue;
192 do {
193 t++;
194 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000195
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 begin = t;
197 while (Py_ISALNUM(t[0]) ||
198 t[0] == '-' || t[0] == '_' || t[0] == '.')
199 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000200
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700202 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200203 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700204 if (!r)
205 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700206 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 if (r != q) {
208 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700209 r = new_string(q, strlen(q), tok);
210 if (!r)
211 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000212 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700213 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200214 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215 }
216 }
217 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700218 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000219}
220
221/* Check whether the line contains a coding spec. If it does,
222 invoke the set_readline function for the new encoding.
223 This function receives the tok_state and the new encoding.
224 Return 1 on success, 0 on failure. */
225
226static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000229{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000232
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200233 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000234 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200235 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200237 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700238 if (!get_coding_spec(line, &cs, size, tok))
239 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200240 if (!cs) {
241 Py_ssize_t i;
242 for (i = 0; i < size; i++) {
243 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
244 break;
245 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
246 /* Stop checking coding spec after a line containing
247 * anything except a comment. */
248 tok->read_coding_spec = 1;
249 break;
250 }
251 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700252 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200253 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700254 tok->read_coding_spec = 1;
255 if (tok->encoding == NULL) {
256 assert(tok->decoding_state == STATE_RAW);
257 if (strcmp(cs, "utf-8") == 0) {
258 tok->encoding = cs;
259 } else {
260 r = set_readline(tok, cs);
261 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000264 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700265 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300266 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 "encoding problem: %s", cs);
268 PyMem_FREE(cs);
269 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 } else { /* then, compare cs with BOM */
272 r = (strcmp(tok->encoding, cs) == 0);
273 if (!r)
274 PyErr_Format(PyExc_SyntaxError,
275 "encoding problem: %s with BOM", cs);
276 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000277 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000279}
280
281/* See whether the file starts with a BOM. If it does,
282 invoke the set_readline function with the new encoding.
283 Return 1 on success, 0 on failure. */
284
285static int
286check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 void unget_char(int, struct tok_state *),
288 int set_readline(struct tok_state *, const char *),
289 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000291 int ch1, ch2, ch3;
292 ch1 = get_char(tok);
293 tok->decoding_state = STATE_RAW;
294 if (ch1 == EOF) {
295 return 1;
296 } else if (ch1 == 0xEF) {
297 ch2 = get_char(tok);
298 if (ch2 != 0xBB) {
299 unget_char(ch2, tok);
300 unget_char(ch1, tok);
301 return 1;
302 }
303 ch3 = get_char(tok);
304 if (ch3 != 0xBF) {
305 unget_char(ch3, tok);
306 unget_char(ch2, tok);
307 unget_char(ch1, tok);
308 return 1;
309 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000310#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000311 /* Disable support for UTF-16 BOMs until a decision
312 is made whether this needs to be supported. */
313 } else if (ch1 == 0xFE) {
314 ch2 = get_char(tok);
315 if (ch2 != 0xFF) {
316 unget_char(ch2, tok);
317 unget_char(ch1, tok);
318 return 1;
319 }
320 if (!set_readline(tok, "utf-16-be"))
321 return 0;
322 tok->decoding_state = STATE_NORMAL;
323 } else if (ch1 == 0xFF) {
324 ch2 = get_char(tok);
325 if (ch2 != 0xFE) {
326 unget_char(ch2, tok);
327 unget_char(ch1, tok);
328 return 1;
329 }
330 if (!set_readline(tok, "utf-16-le"))
331 return 0;
332 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 } else {
335 unget_char(ch1, tok);
336 return 1;
337 }
338 if (tok->encoding != NULL)
339 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700340 tok->encoding = new_string("utf-8", 5, tok);
341 if (!tok->encoding)
342 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000343 /* No need to set_readline: input is already utf-8 */
344 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000345}
346
347/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000348 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000349
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000350 On entry, tok->decoding_buffer will be one of:
351 1) NULL: need to call tok->decoding_readline to get a new line
352 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000353 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000354 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000355 (in the s buffer) to copy entire contents of the line read
356 by tok->decoding_readline. tok->decoding_buffer has the overflow.
357 In this case, fp_readl is called in a loop (with an expanded buffer)
358 until the buffer ends with a '\n' (or until the end of the file is
359 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000360*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361
362static char *
363fp_readl(char *s, int size, struct tok_state *tok)
364{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 PyObject* bufobj;
366 const char *buf;
367 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000369 /* Ask for one less byte so we can terminate it */
370 assert(size > 0);
371 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 if (tok->decoding_buffer) {
374 bufobj = tok->decoding_buffer;
375 Py_INCREF(bufobj);
376 }
377 else
378 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100379 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000380 if (bufobj == NULL)
381 goto error;
382 }
383 if (PyUnicode_CheckExact(bufobj))
384 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200385 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 if (buf == NULL) {
387 goto error;
388 }
389 }
390 else
391 {
392 buf = PyByteArray_AsString(bufobj);
393 if (buf == NULL) {
394 goto error;
395 }
396 buflen = PyByteArray_GET_SIZE(bufobj);
397 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000398
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399 Py_XDECREF(tok->decoding_buffer);
400 if (buflen > size) {
401 /* Too many chars, the rest goes into tok->decoding_buffer */
402 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
403 buflen-size);
404 if (tok->decoding_buffer == NULL)
405 goto error;
406 buflen = size;
407 }
408 else
409 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000411 memcpy(s, buf, buflen);
412 s[buflen] = '\0';
413 if (buflen == 0) /* EOF */
414 s = NULL;
415 Py_DECREF(bufobj);
416 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000417
418error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 Py_XDECREF(bufobj);
420 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000421}
422
423/* Set the readline function for TOK to a StreamReader's
424 readline function. The StreamReader is named ENC.
425
426 This function is called from check_bom and check_coding_spec.
427
428 ENC is usually identical to the future value of tok->encoding,
429 except for the (currently unsupported) case of UTF-16.
430
431 Return 1 on success, 0 on failure. */
432
433static int
434fp_setreadl(struct tok_state *tok, const char* enc)
435{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700436 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200437 _Py_IDENTIFIER(open);
438 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000439 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200440 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441
Victor Stinner22a351a2010-10-14 12:04:34 +0000442 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200443 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100444 * position of tok->fp. If tok->fp was opened in text mode on Windows,
445 * its file position counts CRLF as one char and can't be directly mapped
446 * to the file offset for fd. Instead we step back one byte and read to
447 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200448 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100449 if (pos == -1 ||
450 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000451 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700452 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000453 }
454
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700455 io = PyImport_ImportModuleNoBlock("io");
456 if (io == NULL)
457 return 0;
458
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200459 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000460 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700461 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700463 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200465 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700466 Py_DECREF(stream);
467 if (readline == NULL)
468 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300469 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700470
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100471 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100472 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700473 if (bufobj == NULL)
474 return 0;
475 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100476 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700478 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479}
480
481/* Fetch the next byte from TOK. */
482
483static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000484 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485}
486
487/* Unfetch the last byte back into TOK. */
488
489static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000490 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000491}
492
/* Check whether the bytes at S start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes, 0 if
   not.  Only the lead-byte range and the 10xxxxxx form of the continuation
   bytes are checked.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the NUL
   and nothing beyond it is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
520
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521/* Read a line of input from TOK. Determine encoding
522 if necessary. */
523
524static char *
525decoding_fgets(char *s, int size, struct tok_state *tok)
526{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000527 char *line = NULL;
528 int badchar = 0;
529 for (;;) {
530 if (tok->decoding_state == STATE_NORMAL) {
531 /* We already have a codec associated with
532 this input. */
533 line = fp_readl(s, size, tok);
534 break;
535 } else if (tok->decoding_state == STATE_RAW) {
536 /* We want a 'raw' read. */
537 line = Py_UniversalNewlineFgets(s, size,
538 tok->fp, NULL);
539 break;
540 } else {
541 /* We have not yet determined the encoding.
542 If an encoding is found, use the file-pointer
543 reader functions from now on. */
544 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
545 return error_ret(tok);
546 assert(tok->decoding_state != STATE_INIT);
547 }
548 }
549 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
550 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
551 return error_ret(tok);
552 }
553 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000555 /* The default encoding is UTF-8, so make sure we don't have any
556 non-UTF-8 sequences in it. */
557 if (line && !tok->encoding) {
558 unsigned char *c;
559 int length;
560 for (c = (unsigned char *)line; *c; c += length)
561 if (!(length = valid_utf8(c))) {
562 badchar = *c;
563 break;
564 }
565 }
566 if (badchar) {
567 /* Need to add 1 to the line number, since this line
568 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200569 PyErr_Format(PyExc_SyntaxError,
570 "Non-UTF-8 code starting with '\\x%.2x' "
571 "in file %U on line %i, "
572 "but no encoding declared; "
573 "see http://python.org/dev/peps/pep-0263/ for details",
574 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 return error_ret(tok);
576 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000578 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579}
580
581static int
582decoding_feof(struct tok_state *tok)
583{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 if (tok->decoding_state != STATE_NORMAL) {
585 return feof(tok->fp);
586 } else {
587 PyObject* buf = tok->decoding_buffer;
588 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100589 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000590 if (buf == NULL) {
591 error_ret(tok);
592 return 1;
593 } else {
594 tok->decoding_buffer = buf;
595 }
596 }
597 return PyObject_Length(buf) == 0;
598 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599}
600
601/* Fetch a byte from TOK, using the string buffer. */
602
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000603static int
604buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000605 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606}
607
608/* Unfetch a byte from TOK, using the string buffer. */
609
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000610static void
611buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000612 tok->str--;
613 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614}
615
616/* Set the readline function for TOK to ENC. For the string-based
617 tokenizer, this means to just record the encoding. */
618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000619static int
620buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 tok->enc = enc;
622 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623}
624
625/* Return a UTF-8 encoding Python string object from the
626 C byte string STR, which is encoded with ENC. */
627
628static PyObject *
629translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 PyObject *utf8;
631 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
632 if (buf == NULL)
633 return NULL;
634 utf8 = PyUnicode_AsUTF8String(buf);
635 Py_DECREF(buf);
636 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637}
638
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000639
640static char *
641translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200642 int skip_next_lf = 0;
643 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 char *buf, *current;
645 char c = '\0';
646 buf = PyMem_MALLOC(needed_length);
647 if (buf == NULL) {
648 tok->done = E_NOMEM;
649 return NULL;
650 }
651 for (current = buf; *s; s++, current++) {
652 c = *s;
653 if (skip_next_lf) {
654 skip_next_lf = 0;
655 if (c == '\n') {
656 c = *++s;
657 if (!c)
658 break;
659 }
660 }
661 if (c == '\r') {
662 skip_next_lf = 1;
663 c = '\n';
664 }
665 *current = c;
666 }
667 /* If this is exec input, add a newline to the end of the string if
668 there isn't one already. */
669 if (exec_input && c != '\n') {
670 *current = '\n';
671 current++;
672 }
673 *current = '\0';
674 final_length = current - buf + 1;
675 if (final_length < needed_length && final_length)
676 /* should never fail */
677 buf = PyMem_REALLOC(buf, final_length);
678 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000679}
680
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000681/* Decode a byte string STR for use as the buffer of TOK.
682 Look for encoding declarations inside STR, and record them
683 inside TOK. */
684
685static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000686decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000687{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000688 PyObject* utf8 = NULL;
689 const char *str;
690 const char *s;
691 const char *newl[2] = {NULL, NULL};
692 int lineno = 0;
693 tok->input = str = translate_newlines(input, single, tok);
694 if (str == NULL)
695 return NULL;
696 tok->enc = NULL;
697 tok->str = str;
698 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
699 return error_ret(tok);
700 str = tok->str; /* string after BOM if any */
701 assert(str);
702 if (tok->enc != NULL) {
703 utf8 = translate_into_utf8(str, tok->enc);
704 if (utf8 == NULL)
705 return error_ret(tok);
706 str = PyBytes_AsString(utf8);
707 }
708 for (s = str;; s++) {
709 if (*s == '\0') break;
710 else if (*s == '\n') {
711 assert(lineno < 2);
712 newl[lineno] = s;
713 lineno++;
714 if (lineno == 2) break;
715 }
716 }
717 tok->enc = NULL;
718 /* need to check line 1 and 2 separately since check_coding_spec
719 assumes a single line as input */
720 if (newl[0]) {
721 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
722 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200723 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
725 tok, buf_setreadl))
726 return error_ret(tok);
727 }
728 }
729 if (tok->enc != NULL) {
730 assert(utf8 == NULL);
731 utf8 = translate_into_utf8(str, tok->enc);
732 if (utf8 == NULL)
733 return error_ret(tok);
734 str = PyBytes_AS_STRING(utf8);
735 }
736 assert(tok->decoding_buffer == NULL);
737 tok->decoding_buffer = utf8; /* CAUTION */
738 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000739}
740
741#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742
743/* Set up tokenizer for string */
744
745struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000746PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 struct tok_state *tok = tok_new();
749 if (tok == NULL)
750 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300751 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 if (str == NULL) {
753 PyTokenizer_Free(tok);
754 return NULL;
755 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000756
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 /* XXX: constify members. */
758 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
759 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760}
761
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000762struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000764{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 struct tok_state *tok = tok_new();
766 if (tok == NULL)
767 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000768#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000770#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 if (str == NULL) {
772 PyTokenizer_Free(tok);
773 return NULL;
774 }
775 tok->decoding_state = STATE_RAW;
776 tok->read_coding_spec = 1;
777 tok->enc = NULL;
778 tok->str = str;
779 tok->encoding = (char *)PyMem_MALLOC(6);
780 if (!tok->encoding) {
781 PyTokenizer_Free(tok);
782 return NULL;
783 }
784 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000785
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 /* XXX: constify members. */
787 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
788 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000789}
790
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000791/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000792
793struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300794PyTokenizer_FromFile(FILE *fp, const char* enc,
795 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000796{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 struct tok_state *tok = tok_new();
798 if (tok == NULL)
799 return NULL;
800 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
801 PyTokenizer_Free(tok);
802 return NULL;
803 }
804 tok->cur = tok->inp = tok->buf;
805 tok->end = tok->buf + BUFSIZ;
806 tok->fp = fp;
807 tok->prompt = ps1;
808 tok->nextprompt = ps2;
809 if (enc != NULL) {
810 /* Must copy encoding declaration since it
811 gets copied into the parse tree. */
812 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
813 if (!tok->encoding) {
814 PyTokenizer_Free(tok);
815 return NULL;
816 }
817 strcpy(tok->encoding, enc);
818 tok->decoding_state = STATE_NORMAL;
819 }
820 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821}
822
823
824/* Free a tok_state structure */
825
826void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000827PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 if (tok->encoding != NULL)
830 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000831#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 Py_XDECREF(tok->decoding_readline);
833 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200834 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000835#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 if (tok->fp != NULL && tok->buf != NULL)
837 PyMem_FREE(tok->buf);
838 if (tok->input)
839 PyMem_FREE((char *)tok->input);
840 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000841}
842
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843/* Get next char, updating state; error code goes into tok->done */
844
845static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200846tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 for (;;) {
849 if (tok->cur != tok->inp) {
850 return Py_CHARMASK(*tok->cur++); /* Fast path */
851 }
852 if (tok->done != E_OK)
853 return EOF;
854 if (tok->fp == NULL) {
855 char *end = strchr(tok->inp, '\n');
856 if (end != NULL)
857 end++;
858 else {
859 end = strchr(tok->inp, '\0');
860 if (end == tok->inp) {
861 tok->done = E_EOF;
862 return EOF;
863 }
864 }
865 if (tok->start == NULL)
866 tok->buf = tok->cur;
867 tok->line_start = tok->cur;
868 tok->lineno++;
869 tok->inp = end;
870 return Py_CHARMASK(*tok->cur++);
871 }
872 if (tok->prompt != NULL) {
873 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000874#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000875 if (newtok != NULL) {
876 char *translated = translate_newlines(newtok, 0, tok);
877 PyMem_FREE(newtok);
878 if (translated == NULL)
879 return EOF;
880 newtok = translated;
881 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 if (tok->encoding && newtok && *newtok) {
883 /* Recode to UTF-8 */
884 Py_ssize_t buflen;
885 const char* buf;
886 PyObject *u = translate_into_utf8(newtok, tok->encoding);
887 PyMem_FREE(newtok);
888 if (!u) {
889 tok->done = E_DECODE;
890 return EOF;
891 }
892 buflen = PyBytes_GET_SIZE(u);
893 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz4c49da02018-12-07 03:11:30 -0700895 if (newtok == NULL) {
896 Py_DECREF(u);
897 tok->done = E_NOMEM;
898 return EOF;
899 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 strcpy(newtok, buf);
901 Py_DECREF(u);
902 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000903#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 if (tok->nextprompt != NULL)
905 tok->prompt = tok->nextprompt;
906 if (newtok == NULL)
907 tok->done = E_INTR;
908 else if (*newtok == '\0') {
909 PyMem_FREE(newtok);
910 tok->done = E_EOF;
911 }
912 else if (tok->start != NULL) {
913 size_t start = tok->start - tok->buf;
914 size_t oldlen = tok->cur - tok->buf;
915 size_t newlen = oldlen + strlen(newtok);
916 char *buf = tok->buf;
917 buf = (char *)PyMem_REALLOC(buf, newlen+1);
918 tok->lineno++;
919 if (buf == NULL) {
920 PyMem_FREE(tok->buf);
921 tok->buf = NULL;
922 PyMem_FREE(newtok);
923 tok->done = E_NOMEM;
924 return EOF;
925 }
926 tok->buf = buf;
927 tok->cur = tok->buf + oldlen;
928 tok->line_start = tok->cur;
929 strcpy(tok->buf + oldlen, newtok);
930 PyMem_FREE(newtok);
931 tok->inp = tok->buf + newlen;
932 tok->end = tok->inp + 1;
933 tok->start = tok->buf + start;
934 }
935 else {
936 tok->lineno++;
937 if (tok->buf != NULL)
938 PyMem_FREE(tok->buf);
939 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000940 tok->cur = tok->buf;
941 tok->line_start = tok->buf;
942 tok->inp = strchr(tok->buf, '\0');
943 tok->end = tok->inp + 1;
944 }
945 }
946 else {
947 int done = 0;
948 Py_ssize_t cur = 0;
949 char *pt;
950 if (tok->start == NULL) {
951 if (tok->buf == NULL) {
952 tok->buf = (char *)
953 PyMem_MALLOC(BUFSIZ);
954 if (tok->buf == NULL) {
955 tok->done = E_NOMEM;
956 return EOF;
957 }
958 tok->end = tok->buf + BUFSIZ;
959 }
960 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
961 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200962 if (!tok->decoding_erred)
963 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 done = 1;
965 }
966 else {
967 tok->done = E_OK;
968 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -0700969 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 }
971 }
972 else {
973 cur = tok->cur - tok->buf;
974 if (decoding_feof(tok)) {
975 tok->done = E_EOF;
976 done = 1;
977 }
978 else
979 tok->done = E_OK;
980 }
981 tok->lineno++;
982 /* Read until '\n' or EOF */
983 while (!done) {
984 Py_ssize_t curstart = tok->start == NULL ? -1 :
985 tok->start - tok->buf;
986 Py_ssize_t curvalid = tok->inp - tok->buf;
987 Py_ssize_t newsize = curvalid + BUFSIZ;
988 char *newbuf = tok->buf;
989 newbuf = (char *)PyMem_REALLOC(newbuf,
990 newsize);
991 if (newbuf == NULL) {
992 tok->done = E_NOMEM;
993 tok->cur = tok->inp;
994 return EOF;
995 }
996 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200997 tok->cur = tok->buf + cur;
998 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 tok->inp = tok->buf + curvalid;
1000 tok->end = tok->buf + newsize;
1001 tok->start = curstart < 0 ? NULL :
1002 tok->buf + curstart;
1003 if (decoding_fgets(tok->inp,
1004 (int)(tok->end - tok->inp),
1005 tok) == NULL) {
1006 /* Break out early on decoding
1007 errors, as tok->buf will be NULL
1008 */
1009 if (tok->decoding_erred)
1010 return EOF;
1011 /* Last line does not end in \n,
1012 fake one */
1013 strcpy(tok->inp, "\n");
1014 }
1015 tok->inp = strchr(tok->inp, '\0');
1016 done = tok->inp[-1] == '\n';
1017 }
1018 if (tok->buf != NULL) {
1019 tok->cur = tok->buf + cur;
1020 tok->line_start = tok->cur;
1021 /* replace "\r\n" with "\n" */
1022 /* For Mac leave the \r, giving a syntax error */
1023 pt = tok->inp - 2;
1024 if (pt >= tok->buf && *pt == '\r') {
1025 *pt++ = '\n';
1026 *pt = '\0';
1027 tok->inp = pt;
1028 }
1029 }
1030 }
1031 if (tok->done != E_OK) {
1032 if (tok->prompt != NULL)
1033 PySys_WriteStderr("\n");
1034 tok->cur = tok->inp;
1035 return EOF;
1036 }
1037 }
1038 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039}
1040
1041
1042/* Back-up one character */
1043
1044static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001045tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001046{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047 if (c != EOF) {
1048 if (--tok->cur < tok->buf)
1049 Py_FatalError("tok_backup: beginning of buffer");
1050 if (*tok->cur != c)
1051 *tok->cur = c;
1052 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001053}
1054
1055
Guido van Rossum926f13a1998-04-09 21:38:06 +00001056static int
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001057syntaxerror(struct tok_state *tok, const char *format, ...)
1058{
1059#ifndef PGEN
1060 va_list vargs;
1061#ifdef HAVE_STDARG_PROTOTYPES
1062 va_start(vargs, format);
1063#else
1064 va_start(vargs);
1065#endif
1066 PyErr_FormatV(PyExc_SyntaxError, format, vargs);
1067 va_end(vargs);
1068 PyErr_SyntaxLocationObject(tok->filename,
1069 tok->lineno,
Victor Stinnerc8846162018-07-21 03:36:06 +02001070 (int)(tok->cur - tok->line_start));
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001071 tok->done = E_ERROR;
1072#else
1073 tok->done = E_TOKEN;
1074#endif
1075 return ERRORTOKEN;
1076}
1077
1078static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001079indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001080{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001081 tok->done = E_TABSPACE;
1082 tok->cur = tok->inp;
1083 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001084}
1085
Martin v. Löwis47383402007-08-15 07:32:56 +00001086#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001087#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001088#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089/* Verify that the identifier follows PEP 3131.
1090 All identifier strings are guaranteed to be "ready" unicode objects.
1091 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001092static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001093verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001094{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 PyObject *s;
1096 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001097 if (tok->decoding_erred)
1098 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Zackery Spytz5061a742018-09-10 00:27:31 -06001100 if (s == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1102 PyErr_Clear();
1103 tok->done = E_IDENTIFIER;
1104 } else {
1105 tok->done = E_ERROR;
1106 }
1107 return 0;
1108 }
1109 result = PyUnicode_IsIdentifier(s);
1110 Py_DECREF(s);
1111 if (result == 0)
1112 tok->done = E_IDENTIFIER;
1113 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001114}
1115#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001116
/* Consume the rest of a run of decimal digits, allowing single
 * underscores between digits.
 *
 * Reading starts with the character after the current position, so the
 * caller is expected to have consumed the first digit already.
 *
 * Returns the first character following the run (it is NOT pushed back),
 * or 0 after reporting "invalid decimal literal" when an underscore is
 * not followed by a digit.
 */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            break;
        }
        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
1138
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001139/* Get next token, after space stripping etc. */
1140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001141static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001142tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001143{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001144 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001148 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 tok->start = NULL;
1150 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001151
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001152 /* Get indentation level */
1153 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001154 int col = 0;
1155 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001156 tok->atbol = 0;
1157 for (;;) {
1158 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001159 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001161 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001163 col = (col / tok->tabsize + 1) * tok->tabsize;
1164 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 }
Brett Cannona721aba2016-09-09 14:57:09 -07001166 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001168 }
1169 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001171 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 }
1173 tok_backup(tok, c);
1174 if (c == '#' || c == '\n') {
1175 /* Lines with only whitespace and/or comments
1176 shouldn't affect the indentation and are
1177 not passed to the parser as NEWLINE tokens,
1178 except *totally* empty lines in interactive
1179 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001180 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001182 }
1183 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001184 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001185 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 /* We can't jump back right here since we still
1187 may need to skip to the end of a comment */
1188 }
1189 if (!blankline && tok->level == 0) {
1190 if (col == tok->indstack[tok->indent]) {
1191 /* No change */
1192 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001193 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001194 }
1195 }
1196 else if (col > tok->indstack[tok->indent]) {
1197 /* Indent -- always one */
1198 if (tok->indent+1 >= MAXINDENT) {
1199 tok->done = E_TOODEEP;
1200 tok->cur = tok->inp;
1201 return ERRORTOKEN;
1202 }
1203 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001204 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 }
1206 tok->pendin++;
1207 tok->indstack[++tok->indent] = col;
1208 tok->altindstack[tok->indent] = altcol;
1209 }
1210 else /* col < tok->indstack[tok->indent] */ {
1211 /* Dedent -- any number, must be consistent */
1212 while (tok->indent > 0 &&
1213 col < tok->indstack[tok->indent]) {
1214 tok->pendin--;
1215 tok->indent--;
1216 }
1217 if (col != tok->indstack[tok->indent]) {
1218 tok->done = E_DEDENT;
1219 tok->cur = tok->inp;
1220 return ERRORTOKEN;
1221 }
1222 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001223 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 }
1225 }
1226 }
1227 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001228
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001230
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 /* Return pending indents/dedents */
1232 if (tok->pendin != 0) {
1233 if (tok->pendin < 0) {
1234 tok->pendin++;
1235 return DEDENT;
1236 }
1237 else {
1238 tok->pendin--;
1239 return INDENT;
1240 }
1241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001242
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001243 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 tok->start = NULL;
1245 /* Skip spaces */
1246 do {
1247 c = tok_nextc(tok);
1248 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001250 /* Set start of current token */
1251 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001252
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001253 /* Skip comment, unless it's a type comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001254 if (c == '#') {
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001255 const char *prefix, *p, *type_start;
1256
Brett Cannona721aba2016-09-09 14:57:09 -07001257 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001259 }
Guido van Rossumdcfcd142019-01-31 03:40:27 -08001260
1261 if (tok->type_comments) {
1262 p = tok->start;
1263 prefix = type_comment_prefix;
1264 while (*prefix && p < tok->cur) {
1265 if (*prefix == ' ') {
1266 while (*p == ' ' || *p == '\t') {
1267 p++;
1268 }
1269 } else if (*prefix == *p) {
1270 p++;
1271 } else {
1272 break;
1273 }
1274
1275 prefix++;
1276 }
1277
1278 /* This is a type comment if we matched all of type_comment_prefix. */
1279 if (!*prefix) {
1280 int is_type_ignore = 1;
1281 tok_backup(tok, c); /* don't eat the newline or EOF */
1282
1283 type_start = p;
1284
1285 is_type_ignore = tok->cur >= p + 6 && memcmp(p, "ignore", 6) == 0;
1286 p += 6;
1287 while (is_type_ignore && p < tok->cur) {
1288 if (*p == '#')
1289 break;
1290 is_type_ignore = is_type_ignore && (*p == ' ' || *p == '\t');
1291 p++;
1292 }
1293
1294 if (is_type_ignore) {
1295 /* If this type ignore is the only thing on the line, consume the newline also. */
1296 if (blankline) {
1297 tok_nextc(tok);
1298 tok->atbol = 1;
1299 }
1300 return TYPE_IGNORE;
1301 } else {
1302 *p_start = (char *) type_start; /* after type_comment_prefix */
1303 *p_end = tok->cur;
1304 return TYPE_COMMENT;
1305 }
1306 }
1307 }
Brett Cannona721aba2016-09-09 14:57:09 -07001308 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001309
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 /* Check for EOF and errors now */
1311 if (c == EOF) {
1312 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1313 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001314
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 /* Identifier (most frequent token!) */
1316 nonascii = 0;
1317 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001318 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001319 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001320 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001321 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001322 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001323 /* Since this is a backwards compatibility support literal we don't
1324 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001325 else if (!(saw_b || saw_u || saw_r || saw_f)
1326 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001327 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001328 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001329 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001330 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001331 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001332 }
1333 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001334 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001335 }
1336 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001337 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001338 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001340 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001342 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 }
1344 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001345 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001347 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 c = tok_nextc(tok);
1349 }
1350 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001351 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001353 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354 *p_start = tok->start;
1355 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001356
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 return NAME;
1358 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001359
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 /* Newline */
1361 if (c == '\n') {
1362 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001363 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001364 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001365 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 *p_start = tok->start;
1367 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1368 tok->cont_line = 0;
1369 return NEWLINE;
1370 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001371
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372 /* Period or number starting with period? */
1373 if (c == '.') {
1374 c = tok_nextc(tok);
1375 if (isdigit(c)) {
1376 goto fraction;
1377 } else if (c == '.') {
1378 c = tok_nextc(tok);
1379 if (c == '.') {
1380 *p_start = tok->start;
1381 *p_end = tok->cur;
1382 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001383 }
1384 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 tok_backup(tok, c);
1386 }
1387 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001388 }
1389 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 tok_backup(tok, c);
1391 }
1392 *p_start = tok->start;
1393 *p_end = tok->cur;
1394 return DOT;
1395 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 /* Number */
1398 if (isdigit(c)) {
1399 if (c == '0') {
1400 /* Hex, octal or binary -- maybe. */
1401 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 /* Hex */
1404 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001405 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001406 if (c == '_') {
1407 c = tok_nextc(tok);
1408 }
1409 if (!isxdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001410 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001411 return syntaxerror(tok, "invalid hexadecimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001412 }
1413 do {
1414 c = tok_nextc(tok);
1415 } while (isxdigit(c));
1416 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 }
1418 else if (c == 'o' || c == 'O') {
1419 /* Octal */
1420 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001421 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001422 if (c == '_') {
1423 c = tok_nextc(tok);
1424 }
1425 if (c < '0' || c >= '8') {
Brett Cannona721aba2016-09-09 14:57:09 -07001426 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001427 if (isdigit(c)) {
1428 return syntaxerror(tok,
1429 "invalid digit '%c' in octal literal", c);
1430 }
1431 else {
1432 return syntaxerror(tok, "invalid octal literal");
1433 }
Brett Cannona721aba2016-09-09 14:57:09 -07001434 }
1435 do {
1436 c = tok_nextc(tok);
1437 } while ('0' <= c && c < '8');
1438 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001439 if (isdigit(c)) {
1440 return syntaxerror(tok,
1441 "invalid digit '%c' in octal literal", c);
1442 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 }
1444 else if (c == 'b' || c == 'B') {
1445 /* Binary */
1446 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001448 if (c == '_') {
1449 c = tok_nextc(tok);
1450 }
1451 if (c != '0' && c != '1') {
Brett Cannona721aba2016-09-09 14:57:09 -07001452 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001453 if (isdigit(c)) {
1454 return syntaxerror(tok,
1455 "invalid digit '%c' in binary literal", c);
1456 }
1457 else {
1458 return syntaxerror(tok, "invalid binary literal");
1459 }
Brett Cannona721aba2016-09-09 14:57:09 -07001460 }
1461 do {
1462 c = tok_nextc(tok);
1463 } while (c == '0' || c == '1');
1464 } while (c == '_');
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001465 if (isdigit(c)) {
1466 return syntaxerror(tok,
1467 "invalid digit '%c' in binary literal", c);
1468 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 }
1470 else {
1471 int nonzero = 0;
1472 /* maybe old-style octal; c is first char of it */
1473 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001474 while (1) {
1475 if (c == '_') {
1476 c = tok_nextc(tok);
1477 if (!isdigit(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001478 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001479 return syntaxerror(tok, "invalid decimal literal");
Brett Cannona721aba2016-09-09 14:57:09 -07001480 }
1481 }
1482 if (c != '0') {
1483 break;
1484 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 c = tok_nextc(tok);
1486 }
Brett Cannona721aba2016-09-09 14:57:09 -07001487 if (isdigit(c)) {
1488 nonzero = 1;
1489 c = tok_decimal_tail(tok);
1490 if (c == 0) {
1491 return ERRORTOKEN;
1492 }
1493 }
1494 if (c == '.') {
1495 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001497 }
1498 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001500 }
1501 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001502 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001503 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001505 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001507 return syntaxerror(tok,
1508 "leading zeros in decimal integer "
1509 "literals are not permitted; "
1510 "use an 0o prefix for octal integers");
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001511 }
1512 }
1513 }
1514 else {
1515 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001516 c = tok_decimal_tail(tok);
1517 if (c == 0) {
1518 return ERRORTOKEN;
1519 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 {
1521 /* Accept floating point numbers. */
1522 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001523 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 fraction:
1525 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001526 if (isdigit(c)) {
1527 c = tok_decimal_tail(tok);
1528 if (c == 0) {
1529 return ERRORTOKEN;
1530 }
1531 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 }
1533 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001534 int e;
1535 exponent:
1536 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 /* Exponent part */
1538 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001539 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001540 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001541 if (!isdigit(c)) {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001542 tok_backup(tok, c);
Serhiy Storchakacf7303e2018-07-09 15:09:35 +03001543 return syntaxerror(tok, "invalid decimal literal");
Benjamin Petersonc4161622014-06-07 12:36:39 -07001544 }
1545 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001547 tok_backup(tok, e);
1548 *p_start = tok->start;
1549 *p_end = tok->cur;
1550 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 }
Brett Cannona721aba2016-09-09 14:57:09 -07001552 c = tok_decimal_tail(tok);
1553 if (c == 0) {
1554 return ERRORTOKEN;
1555 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001556 }
Brett Cannona721aba2016-09-09 14:57:09 -07001557 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 /* Imaginary part */
1559 imaginary:
1560 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001561 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001562 }
1563 }
1564 tok_backup(tok, c);
1565 *p_start = tok->start;
1566 *p_end = tok->cur;
1567 return NUMBER;
1568 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001569
1570 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001571 /* String */
1572 if (c == '\'' || c == '"') {
1573 int quote = c;
1574 int quote_size = 1; /* 1 or 3 */
1575 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001576
Anthony Sottile995d9b92019-01-12 20:05:13 -08001577 /* Nodes of type STRING, especially multi line strings
1578 must be handled differently in order to get both
1579 the starting line number and the column offset right.
1580 (cf. issue 16806) */
1581 tok->first_lineno = tok->lineno;
1582 tok->multi_line_start = tok->line_start;
1583
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 /* Find the quote size and start of string */
1585 c = tok_nextc(tok);
1586 if (c == quote) {
1587 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001588 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001590 }
1591 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001593 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 }
Brett Cannona721aba2016-09-09 14:57:09 -07001595 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001597 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001598
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001599 /* Get rest of string */
1600 while (end_quote_size != quote_size) {
1601 c = tok_nextc(tok);
1602 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001603 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001605 }
1606 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001608 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 tok->cur = tok->inp;
1610 return ERRORTOKEN;
1611 }
1612 if (quote_size == 1 && c == '\n') {
1613 tok->done = E_EOLS;
1614 tok->cur = tok->inp;
1615 return ERRORTOKEN;
1616 }
Brett Cannona721aba2016-09-09 14:57:09 -07001617 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001619 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 else {
1621 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001622 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001623 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001624 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 }
1626 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001627
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 *p_start = tok->start;
1629 *p_end = tok->cur;
1630 return STRING;
1631 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001632
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 /* Line continuation */
1634 if (c == '\\') {
1635 c = tok_nextc(tok);
1636 if (c != '\n') {
1637 tok->done = E_LINECONT;
1638 tok->cur = tok->inp;
1639 return ERRORTOKEN;
1640 }
1641 tok->cont_line = 1;
1642 goto again; /* Read next line */
1643 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001644
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 /* Check for two-character token */
1646 {
1647 int c2 = tok_nextc(tok);
1648 int token = PyToken_TwoChars(c, c2);
1649 if (token != OP) {
1650 int c3 = tok_nextc(tok);
1651 int token3 = PyToken_ThreeChars(c, c2, c3);
1652 if (token3 != OP) {
1653 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001654 }
1655 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001656 tok_backup(tok, c3);
1657 }
1658 *p_start = tok->start;
1659 *p_end = tok->cur;
1660 return token;
1661 }
1662 tok_backup(tok, c2);
1663 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001664
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 /* Keep track of parentheses nesting level */
1666 switch (c) {
1667 case '(':
1668 case '[':
1669 case '{':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001670#ifndef PGEN
1671 if (tok->level >= MAXLEVEL) {
1672 return syntaxerror(tok, "too many nested parentheses");
1673 }
1674 tok->parenstack[tok->level] = c;
1675 tok->parenlinenostack[tok->level] = tok->lineno;
1676#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 tok->level++;
1678 break;
1679 case ')':
1680 case ']':
1681 case '}':
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001682#ifndef PGEN
1683 if (!tok->level) {
1684 return syntaxerror(tok, "unmatched '%c'", c);
1685 }
1686#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 tok->level--;
Serhiy Storchaka94cf3082018-12-17 17:34:14 +02001688#ifndef PGEN
1689 int opening = tok->parenstack[tok->level];
1690 if (!((opening == '(' && c == ')') ||
1691 (opening == '[' && c == ']') ||
1692 (opening == '{' && c == '}')))
1693 {
1694 if (tok->parenlinenostack[tok->level] != tok->lineno) {
1695 return syntaxerror(tok,
1696 "closing parenthesis '%c' does not match "
1697 "opening parenthesis '%c' on line %d",
1698 c, opening, tok->parenlinenostack[tok->level]);
1699 }
1700 else {
1701 return syntaxerror(tok,
1702 "closing parenthesis '%c' does not match "
1703 "opening parenthesis '%c'",
1704 c, opening);
1705 }
1706 }
1707#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 break;
1709 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001710
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 /* Punctuation character */
1712 *p_start = tok->start;
1713 *p_end = tok->cur;
1714 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001715}
1716
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001717int
1718PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1719{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 int result = tok_get(tok, p_start, p_end);
1721 if (tok->decoding_erred) {
1722 result = ERRORTOKEN;
1723 tok->done = E_DECODE;
1724 }
1725 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001726}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001727
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001728/* Get the encoding of a Python file. Check for the coding cookie and check if
1729 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001730
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001731 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1732 encoding in the first or second line of the file (in which case the encoding
1733 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001734
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001735 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1736 by the caller. */
1737
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001738char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001739PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001740{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001741 struct tok_state *tok;
1742 FILE *fp;
1743 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001744
Victor Stinnerdaf45552013-08-28 00:53:59 +02001745#ifndef PGEN
1746 fd = _Py_dup(fd);
1747#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001749#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750 if (fd < 0) {
1751 return NULL;
1752 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001753
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 fp = fdopen(fd, "r");
1755 if (fp == NULL) {
1756 return NULL;
1757 }
1758 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1759 if (tok == NULL) {
1760 fclose(fp);
1761 return NULL;
1762 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001763#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001764 if (filename != NULL) {
1765 Py_INCREF(filename);
1766 tok->filename = filename;
1767 }
1768 else {
1769 tok->filename = PyUnicode_FromString("<string>");
1770 if (tok->filename == NULL) {
1771 fclose(fp);
1772 PyTokenizer_Free(tok);
1773 return encoding;
1774 }
1775 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001776#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 while (tok->lineno < 2 && tok->done == E_OK) {
1778 PyTokenizer_Get(tok, &p_start, &p_end);
1779 }
1780 fclose(fp);
1781 if (tok->encoding) {
1782 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1783 if (encoding)
1784 strcpy(encoding, tok->encoding);
1785 }
1786 PyTokenizer_Free(tok);
1787 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001788}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001789
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001790char *
1791PyTokenizer_FindEncoding(int fd)
1792{
1793 return PyTokenizer_FindEncodingFilename(fd, NULL);
1794}
1795
Guido van Rossum408027e1996-12-30 16:17:54 +00001796#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001797
1798void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001799tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001800{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001801 printf("%s", _PyParser_TokenNames[type]);
1802 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1803 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001804}
1805
1806#endif