blob: e374c5a4aee615bd8bd7653b35bcf0f1b21c61d0 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -080021/* Alternate tab spacing */
22#define ALTTABSIZE 1
23
/* True if C may begin an identifier: an ASCII letter, an underscore, or any
   value >= 128.  The argument is parenthesized at every use so that an
   arbitrary expression expands correctly.  It is still evaluated more than
   once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000029
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128.  The argument is parenthesized at every
   use so that an arbitrary expression expands correctly.  It is still
   evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000036
Serhiy Storchakac6792272013-10-19 21:03:34 +030037extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000038/* Return malloc'ed string including trailing \n;
39 empty malloc'ed string for EOF;
40 NULL if interrupted */
41
Guido van Rossum4fe87291992-02-26 15:24:44 +000042/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000043#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000044
Guido van Rossum3f5da241990-12-20 15:06:42 +000045/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000046static struct tok_state *tok_new(void);
47static int tok_nextc(struct tok_state *tok);
48static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000049
Brett Cannond5ec98c2007-10-20 02:54:14 +000050
/* Token names */

/* Printable name for each token type, in token-number order.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB", "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH", "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "LBRACE", "RBRACE",
    /* comparisons and remaining operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL", "TILDE",
    "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL", "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "ATEQUAL", "RARROW", "ELLIPSIS",
    /* generic and bookkeeping entries */
    "OP", "<ERRORTOKEN>", "COMMENT", "NL", "ENCODING",
    "<N_TOKENS>"
};
115
116
117/* Create and initialize a new tok_state structure */
118
119static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000120tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000121{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000122 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
123 sizeof(struct tok_state));
124 if (tok == NULL)
125 return NULL;
126 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
127 tok->done = E_OK;
128 tok->fp = NULL;
129 tok->input = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400133
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 tok->atbol = 1;
135 tok->pendin = 0;
136 tok->prompt = tok->nextprompt = NULL;
137 tok->lineno = 0;
138 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 tok->altindstack[0] = 0;
140 tok->decoding_state = STATE_INIT;
141 tok->decoding_erred = 0;
142 tok->read_coding_spec = 0;
143 tok->enc = NULL;
144 tok->encoding = NULL;
145 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000146#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200147 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 tok->decoding_readline = NULL;
149 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000150#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300151
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000153}
154
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000155static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700156new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000157{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159 if (!result) {
160 tok->done = E_NOMEM;
161 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700163 memcpy(result, s, len);
164 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000166}
167
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168#ifdef PGEN
169
170static char *
171decoding_fgets(char *s, int size, struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
176static int
177decoding_feof(struct tok_state *tok)
178{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180}
181
/* PGEN build: no decoding is done, just return a heap copy of STR made with
   new_string() (NULL on allocation failure).  EXEC_INPUT is ignored. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
187
188#else /* PGEN */
189
190static char *
191error_ret(struct tok_state *tok) /* XXX */
192{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 tok->decoding_erred = 1;
194 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
195 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200196 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
197 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000199}
200
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000201
/* Map the common spellings of the UTF-8 and Latin-1 codec names to one
   canonical name.

   At most the first 12 bytes of S are examined, lower-cased, with '_'
   treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when S is one
   of the recognized spellings (optionally followed by "-<suffix>");
   otherwise returns S itself, so callers can detect "no change" by comparing
   pointers. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative char value to
           tolower() is undefined behavior. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
230
231/* Return the coding spec in S, or NULL if none is found. */
232
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700233static int
234get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 /* Coding spec must be in a comment, and that comment must be
239 * the only statement on the source code line. */
240 for (i = 0; i < size - 6; i++) {
241 if (s[i] == '#')
242 break;
243 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700244 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 }
246 for (; i < size - 6; i++) { /* XXX inefficient search */
247 const char* t = s + i;
248 if (strncmp(t, "coding", 6) == 0) {
249 const char* begin = NULL;
250 t += 6;
251 if (t[0] != ':' && t[0] != '=')
252 continue;
253 do {
254 t++;
255 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 begin = t;
258 while (Py_ISALNUM(t[0]) ||
259 t[0] == '-' || t[0] == '_' || t[0] == '.')
260 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200264 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700265 if (!r)
266 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700267 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (r != q) {
269 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700270 r = new_string(q, strlen(q), tok);
271 if (!r)
272 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700274 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200275 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 }
277 }
278 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700279 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280}
281
282/* Check whether the line contains a coding spec. If it does,
283 invoke the set_readline function for the new encoding.
284 This function receives the tok_state and the new encoding.
285 Return 1 on success, 0 on failure. */
286
287static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000288check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000293
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200294 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000295 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200296 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000297 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200298 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700299 if (!get_coding_spec(line, &cs, size, tok))
300 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200301 if (!cs) {
302 Py_ssize_t i;
303 for (i = 0; i < size; i++) {
304 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
305 break;
306 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
307 /* Stop checking coding spec after a line containing
308 * anything except a comment. */
309 tok->read_coding_spec = 1;
310 break;
311 }
312 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700313 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200314 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700315 tok->read_coding_spec = 1;
316 if (tok->encoding == NULL) {
317 assert(tok->decoding_state == STATE_RAW);
318 if (strcmp(cs, "utf-8") == 0) {
319 tok->encoding = cs;
320 } else {
321 r = set_readline(tok, cs);
322 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700326 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300327 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700328 "encoding problem: %s", cs);
329 PyMem_FREE(cs);
330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700332 } else { /* then, compare cs with BOM */
333 r = (strcmp(tok->encoding, cs) == 0);
334 if (!r)
335 PyErr_Format(PyExc_SyntaxError,
336 "encoding problem: %s with BOM", cs);
337 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000339 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340}
341
342/* See whether the file starts with a BOM. If it does,
343 invoke the set_readline function with the new encoding.
344 Return 1 on success, 0 on failure. */
345
346static int
347check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 void unget_char(int, struct tok_state *),
349 int set_readline(struct tok_state *, const char *),
350 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 int ch1, ch2, ch3;
353 ch1 = get_char(tok);
354 tok->decoding_state = STATE_RAW;
355 if (ch1 == EOF) {
356 return 1;
357 } else if (ch1 == 0xEF) {
358 ch2 = get_char(tok);
359 if (ch2 != 0xBB) {
360 unget_char(ch2, tok);
361 unget_char(ch1, tok);
362 return 1;
363 }
364 ch3 = get_char(tok);
365 if (ch3 != 0xBF) {
366 unget_char(ch3, tok);
367 unget_char(ch2, tok);
368 unget_char(ch1, tok);
369 return 1;
370 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 /* Disable support for UTF-16 BOMs until a decision
373 is made whether this needs to be supported. */
374 } else if (ch1 == 0xFE) {
375 ch2 = get_char(tok);
376 if (ch2 != 0xFF) {
377 unget_char(ch2, tok);
378 unget_char(ch1, tok);
379 return 1;
380 }
381 if (!set_readline(tok, "utf-16-be"))
382 return 0;
383 tok->decoding_state = STATE_NORMAL;
384 } else if (ch1 == 0xFF) {
385 ch2 = get_char(tok);
386 if (ch2 != 0xFE) {
387 unget_char(ch2, tok);
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (!set_readline(tok, "utf-16-le"))
392 return 0;
393 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000394#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 } else {
396 unget_char(ch1, tok);
397 return 1;
398 }
399 if (tok->encoding != NULL)
400 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700401 tok->encoding = new_string("utf-8", 5, tok);
402 if (!tok->encoding)
403 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 /* No need to set_readline: input is already utf-8 */
405 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000406}
407
408/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000409 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000410
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411 On entry, tok->decoding_buffer will be one of:
412 1) NULL: need to call tok->decoding_readline to get a new line
413 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000414 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000415 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000416 (in the s buffer) to copy entire contents of the line read
417 by tok->decoding_readline. tok->decoding_buffer has the overflow.
418 In this case, fp_readl is called in a loop (with an expanded buffer)
419 until the buffer ends with a '\n' (or until the end of the file is
420 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000422
423static char *
424fp_readl(char *s, int size, struct tok_state *tok)
425{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyObject* bufobj;
427 const char *buf;
428 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000429
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 /* Ask for one less byte so we can terminate it */
431 assert(size > 0);
432 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000434 if (tok->decoding_buffer) {
435 bufobj = tok->decoding_buffer;
436 Py_INCREF(bufobj);
437 }
438 else
439 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100440 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 if (bufobj == NULL)
442 goto error;
443 }
444 if (PyUnicode_CheckExact(bufobj))
445 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200446 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 if (buf == NULL) {
448 goto error;
449 }
450 }
451 else
452 {
453 buf = PyByteArray_AsString(bufobj);
454 if (buf == NULL) {
455 goto error;
456 }
457 buflen = PyByteArray_GET_SIZE(bufobj);
458 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000460 Py_XDECREF(tok->decoding_buffer);
461 if (buflen > size) {
462 /* Too many chars, the rest goes into tok->decoding_buffer */
463 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
464 buflen-size);
465 if (tok->decoding_buffer == NULL)
466 goto error;
467 buflen = size;
468 }
469 else
470 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000471
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 memcpy(s, buf, buflen);
473 s[buflen] = '\0';
474 if (buflen == 0) /* EOF */
475 s = NULL;
476 Py_DECREF(bufobj);
477 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000478
479error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 Py_XDECREF(bufobj);
481 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482}
483
484/* Set the readline function for TOK to a StreamReader's
485 readline function. The StreamReader is named ENC.
486
487 This function is called from check_bom and check_coding_spec.
488
489 ENC is usually identical to the future value of tok->encoding,
490 except for the (currently unsupported) case of UTF-16.
491
492 Return 1 on success, 0 on failure. */
493
494static int
495fp_setreadl(struct tok_state *tok, const char* enc)
496{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700497 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200498 _Py_IDENTIFIER(open);
499 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000500 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200501 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000502
Victor Stinner22a351a2010-10-14 12:04:34 +0000503 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200504 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100505 * position of tok->fp. If tok->fp was opened in text mode on Windows,
506 * its file position counts CRLF as one char and can't be directly mapped
507 * to the file offset for fd. Instead we step back one byte and read to
508 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200509 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100510 if (pos == -1 ||
511 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000512 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700513 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000514 }
515
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700516 io = PyImport_ImportModuleNoBlock("io");
517 if (io == NULL)
518 return 0;
519
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200520 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000521 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700522 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000523 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700524 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000525
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200526 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700527 Py_DECREF(stream);
528 if (readline == NULL)
529 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300530 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700531
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100532 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100533 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700534 if (bufobj == NULL)
535 return 0;
536 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100537 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000538
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700539 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540}
541
542/* Fetch the next byte from TOK. */
543
544static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546}
547
548/* Unfetch the last byte back into TOK. */
549
550static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
/* Check whether the bytes at s start a structurally valid UTF-8 sequence:
   a lead byte followed by the right number of continuation bytes
   (0x80..0xBF).  Return the number of bytes forming the sequence if yes,
   0 if not.

   S must point into a NUL-terminated string.  Continuation bytes are
   examined front to back and the scan stops at the first byte that is not a
   continuation byte.  A NUL is not one, so a sequence cut short by the end
   of the string is rejected without reading past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;

    for (i = 1; i <= trailing; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return trailing + 1;
}
581
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582/* Read a line of input from TOK. Determine encoding
583 if necessary. */
584
585static char *
586decoding_fgets(char *s, int size, struct tok_state *tok)
587{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 char *line = NULL;
589 int badchar = 0;
590 for (;;) {
591 if (tok->decoding_state == STATE_NORMAL) {
592 /* We already have a codec associated with
593 this input. */
594 line = fp_readl(s, size, tok);
595 break;
596 } else if (tok->decoding_state == STATE_RAW) {
597 /* We want a 'raw' read. */
598 line = Py_UniversalNewlineFgets(s, size,
599 tok->fp, NULL);
600 break;
601 } else {
602 /* We have not yet determined the encoding.
603 If an encoding is found, use the file-pointer
604 reader functions from now on. */
605 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
606 return error_ret(tok);
607 assert(tok->decoding_state != STATE_INIT);
608 }
609 }
610 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
611 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
612 return error_ret(tok);
613 }
614 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 /* The default encoding is UTF-8, so make sure we don't have any
617 non-UTF-8 sequences in it. */
618 if (line && !tok->encoding) {
619 unsigned char *c;
620 int length;
621 for (c = (unsigned char *)line; *c; c += length)
622 if (!(length = valid_utf8(c))) {
623 badchar = *c;
624 break;
625 }
626 }
627 if (badchar) {
628 /* Need to add 1 to the line number, since this line
629 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200630 PyErr_Format(PyExc_SyntaxError,
631 "Non-UTF-8 code starting with '\\x%.2x' "
632 "in file %U on line %i, "
633 "but no encoding declared; "
634 "see http://python.org/dev/peps/pep-0263/ for details",
635 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 return error_ret(tok);
637 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640}
641
642static int
643decoding_feof(struct tok_state *tok)
644{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 if (tok->decoding_state != STATE_NORMAL) {
646 return feof(tok->fp);
647 } else {
648 PyObject* buf = tok->decoding_buffer;
649 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100650 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 if (buf == NULL) {
652 error_ret(tok);
653 return 1;
654 } else {
655 tok->decoding_buffer = buf;
656 }
657 }
658 return PyObject_Length(buf) == 0;
659 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660}
661
662/* Fetch a byte from TOK, using the string buffer. */
663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000664static int
665buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667}
668
669/* Unfetch a byte from TOK, using the string buffer. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static void
672buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->str--;
674 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Set the readline function for TOK to ENC. For the string-based
678 tokenizer, this means to just record the encoding. */
679
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000680static int
681buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 tok->enc = enc;
683 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684}
685
686/* Return a UTF-8 encoding Python string object from the
687 C byte string STR, which is encoded with ENC. */
688
689static PyObject *
690translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *utf8;
692 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
693 if (buf == NULL)
694 return NULL;
695 utf8 = PyUnicode_AsUTF8String(buf);
696 Py_DECREF(buf);
697 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698}
699
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000700
701static char *
702translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200703 int skip_next_lf = 0;
704 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 char *buf, *current;
706 char c = '\0';
707 buf = PyMem_MALLOC(needed_length);
708 if (buf == NULL) {
709 tok->done = E_NOMEM;
710 return NULL;
711 }
712 for (current = buf; *s; s++, current++) {
713 c = *s;
714 if (skip_next_lf) {
715 skip_next_lf = 0;
716 if (c == '\n') {
717 c = *++s;
718 if (!c)
719 break;
720 }
721 }
722 if (c == '\r') {
723 skip_next_lf = 1;
724 c = '\n';
725 }
726 *current = c;
727 }
728 /* If this is exec input, add a newline to the end of the string if
729 there isn't one already. */
730 if (exec_input && c != '\n') {
731 *current = '\n';
732 current++;
733 }
734 *current = '\0';
735 final_length = current - buf + 1;
Victor Stinner65b98492019-03-20 13:03:11 +0100736 if (final_length < needed_length && final_length) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 /* should never fail */
Victor Stinner65b98492019-03-20 13:03:11 +0100738 char* result = PyMem_REALLOC(buf, final_length);
739 if (result == NULL) {
740 PyMem_FREE(buf);
741 }
742 buf = result;
743 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000745}
746
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000747/* Decode a byte string STR for use as the buffer of TOK.
748 Look for encoding declarations inside STR, and record them
749 inside TOK. */
750
751static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000752decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000753{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 PyObject* utf8 = NULL;
755 const char *str;
756 const char *s;
757 const char *newl[2] = {NULL, NULL};
758 int lineno = 0;
759 tok->input = str = translate_newlines(input, single, tok);
760 if (str == NULL)
761 return NULL;
762 tok->enc = NULL;
763 tok->str = str;
764 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
765 return error_ret(tok);
766 str = tok->str; /* string after BOM if any */
767 assert(str);
768 if (tok->enc != NULL) {
769 utf8 = translate_into_utf8(str, tok->enc);
770 if (utf8 == NULL)
771 return error_ret(tok);
772 str = PyBytes_AsString(utf8);
773 }
774 for (s = str;; s++) {
775 if (*s == '\0') break;
776 else if (*s == '\n') {
777 assert(lineno < 2);
778 newl[lineno] = s;
779 lineno++;
780 if (lineno == 2) break;
781 }
782 }
783 tok->enc = NULL;
784 /* need to check line 1 and 2 separately since check_coding_spec
785 assumes a single line as input */
786 if (newl[0]) {
787 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
788 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200789 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
791 tok, buf_setreadl))
792 return error_ret(tok);
793 }
794 }
795 if (tok->enc != NULL) {
796 assert(utf8 == NULL);
797 utf8 = translate_into_utf8(str, tok->enc);
798 if (utf8 == NULL)
799 return error_ret(tok);
800 str = PyBytes_AS_STRING(utf8);
801 }
802 assert(tok->decoding_buffer == NULL);
803 tok->decoding_buffer = utf8; /* CAUTION */
804 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000805}
806
807#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808
809/* Set up tokenizer for string */
810
811struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000812PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 struct tok_state *tok = tok_new();
815 if (tok == NULL)
816 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300817 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 if (str == NULL) {
819 PyTokenizer_Free(tok);
820 return NULL;
821 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000822
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 /* XXX: constify members. */
824 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
825 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826}
827
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000828struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000829PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000830{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 struct tok_state *tok = tok_new();
832 if (tok == NULL)
833 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000834#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000836#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 if (str == NULL) {
838 PyTokenizer_Free(tok);
839 return NULL;
840 }
841 tok->decoding_state = STATE_RAW;
842 tok->read_coding_spec = 1;
843 tok->enc = NULL;
844 tok->str = str;
845 tok->encoding = (char *)PyMem_MALLOC(6);
846 if (!tok->encoding) {
847 PyTokenizer_Free(tok);
848 return NULL;
849 }
850 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000851
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 /* XXX: constify members. */
853 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
854 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000855}
856
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000857/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858
859struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300860PyTokenizer_FromFile(FILE *fp, const char* enc,
861 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000863 struct tok_state *tok = tok_new();
864 if (tok == NULL)
865 return NULL;
866 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
867 PyTokenizer_Free(tok);
868 return NULL;
869 }
870 tok->cur = tok->inp = tok->buf;
871 tok->end = tok->buf + BUFSIZ;
872 tok->fp = fp;
873 tok->prompt = ps1;
874 tok->nextprompt = ps2;
875 if (enc != NULL) {
876 /* Must copy encoding declaration since it
877 gets copied into the parse tree. */
878 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
879 if (!tok->encoding) {
880 PyTokenizer_Free(tok);
881 return NULL;
882 }
883 strcpy(tok->encoding, enc);
884 tok->decoding_state = STATE_NORMAL;
885 }
886 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887}
888
889
890/* Free a tok_state structure */
891
892void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000893PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000894{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000895 if (tok->encoding != NULL)
896 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000897#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 Py_XDECREF(tok->decoding_readline);
899 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200900 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000901#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 if (tok->fp != NULL && tok->buf != NULL)
903 PyMem_FREE(tok->buf);
904 if (tok->input)
905 PyMem_FREE((char *)tok->input);
906 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907}
908
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909/* Get next char, updating state; error code goes into tok->done */
910
911static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200912tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 for (;;) {
915 if (tok->cur != tok->inp) {
916 return Py_CHARMASK(*tok->cur++); /* Fast path */
917 }
918 if (tok->done != E_OK)
919 return EOF;
920 if (tok->fp == NULL) {
921 char *end = strchr(tok->inp, '\n');
922 if (end != NULL)
923 end++;
924 else {
925 end = strchr(tok->inp, '\0');
926 if (end == tok->inp) {
927 tok->done = E_EOF;
928 return EOF;
929 }
930 }
931 if (tok->start == NULL)
932 tok->buf = tok->cur;
933 tok->line_start = tok->cur;
934 tok->lineno++;
935 tok->inp = end;
936 return Py_CHARMASK(*tok->cur++);
937 }
938 if (tok->prompt != NULL) {
939 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000940#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000941 if (newtok != NULL) {
942 char *translated = translate_newlines(newtok, 0, tok);
943 PyMem_FREE(newtok);
944 if (translated == NULL)
945 return EOF;
946 newtok = translated;
947 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000948 if (tok->encoding && newtok && *newtok) {
949 /* Recode to UTF-8 */
950 Py_ssize_t buflen;
951 const char* buf;
952 PyObject *u = translate_into_utf8(newtok, tok->encoding);
953 PyMem_FREE(newtok);
954 if (!u) {
955 tok->done = E_DECODE;
956 return EOF;
957 }
958 buflen = PyBytes_GET_SIZE(u);
959 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000960 newtok = PyMem_MALLOC(buflen+1);
Zackery Spytz602d3072018-12-07 05:17:43 -0700961 if (newtok == NULL) {
962 Py_DECREF(u);
963 tok->done = E_NOMEM;
964 return EOF;
965 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 strcpy(newtok, buf);
967 Py_DECREF(u);
968 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000969#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 if (tok->nextprompt != NULL)
971 tok->prompt = tok->nextprompt;
972 if (newtok == NULL)
973 tok->done = E_INTR;
974 else if (*newtok == '\0') {
975 PyMem_FREE(newtok);
976 tok->done = E_EOF;
977 }
978 else if (tok->start != NULL) {
979 size_t start = tok->start - tok->buf;
980 size_t oldlen = tok->cur - tok->buf;
981 size_t newlen = oldlen + strlen(newtok);
982 char *buf = tok->buf;
983 buf = (char *)PyMem_REALLOC(buf, newlen+1);
984 tok->lineno++;
985 if (buf == NULL) {
986 PyMem_FREE(tok->buf);
987 tok->buf = NULL;
988 PyMem_FREE(newtok);
989 tok->done = E_NOMEM;
990 return EOF;
991 }
992 tok->buf = buf;
993 tok->cur = tok->buf + oldlen;
994 tok->line_start = tok->cur;
995 strcpy(tok->buf + oldlen, newtok);
996 PyMem_FREE(newtok);
997 tok->inp = tok->buf + newlen;
998 tok->end = tok->inp + 1;
999 tok->start = tok->buf + start;
1000 }
1001 else {
1002 tok->lineno++;
1003 if (tok->buf != NULL)
1004 PyMem_FREE(tok->buf);
1005 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 tok->cur = tok->buf;
1007 tok->line_start = tok->buf;
1008 tok->inp = strchr(tok->buf, '\0');
1009 tok->end = tok->inp + 1;
1010 }
1011 }
1012 else {
1013 int done = 0;
1014 Py_ssize_t cur = 0;
1015 char *pt;
1016 if (tok->start == NULL) {
1017 if (tok->buf == NULL) {
1018 tok->buf = (char *)
1019 PyMem_MALLOC(BUFSIZ);
1020 if (tok->buf == NULL) {
1021 tok->done = E_NOMEM;
1022 return EOF;
1023 }
1024 tok->end = tok->buf + BUFSIZ;
1025 }
1026 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1027 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001028 if (!tok->decoding_erred)
1029 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 done = 1;
1031 }
1032 else {
1033 tok->done = E_OK;
1034 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -07001035 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001036 }
1037 }
1038 else {
1039 cur = tok->cur - tok->buf;
1040 if (decoding_feof(tok)) {
1041 tok->done = E_EOF;
1042 done = 1;
1043 }
1044 else
1045 tok->done = E_OK;
1046 }
1047 tok->lineno++;
1048 /* Read until '\n' or EOF */
1049 while (!done) {
1050 Py_ssize_t curstart = tok->start == NULL ? -1 :
1051 tok->start - tok->buf;
1052 Py_ssize_t curvalid = tok->inp - tok->buf;
1053 Py_ssize_t newsize = curvalid + BUFSIZ;
1054 char *newbuf = tok->buf;
1055 newbuf = (char *)PyMem_REALLOC(newbuf,
1056 newsize);
1057 if (newbuf == NULL) {
1058 tok->done = E_NOMEM;
1059 tok->cur = tok->inp;
1060 return EOF;
1061 }
1062 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001063 tok->cur = tok->buf + cur;
1064 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 tok->inp = tok->buf + curvalid;
1066 tok->end = tok->buf + newsize;
1067 tok->start = curstart < 0 ? NULL :
1068 tok->buf + curstart;
1069 if (decoding_fgets(tok->inp,
1070 (int)(tok->end - tok->inp),
1071 tok) == NULL) {
1072 /* Break out early on decoding
1073 errors, as tok->buf will be NULL
1074 */
1075 if (tok->decoding_erred)
1076 return EOF;
1077 /* Last line does not end in \n,
1078 fake one */
1079 strcpy(tok->inp, "\n");
1080 }
1081 tok->inp = strchr(tok->inp, '\0');
1082 done = tok->inp[-1] == '\n';
1083 }
1084 if (tok->buf != NULL) {
1085 tok->cur = tok->buf + cur;
1086 tok->line_start = tok->cur;
1087 /* replace "\r\n" with "\n" */
1088 /* For Mac leave the \r, giving a syntax error */
1089 pt = tok->inp - 2;
1090 if (pt >= tok->buf && *pt == '\r') {
1091 *pt++ = '\n';
1092 *pt = '\0';
1093 tok->inp = pt;
1094 }
1095 }
1096 }
1097 if (tok->done != E_OK) {
1098 if (tok->prompt != NULL)
1099 PySys_WriteStderr("\n");
1100 tok->cur = tok->inp;
1101 return EOF;
1102 }
1103 }
1104 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001105}
1106
1107
1108/* Back-up one character */
1109
1110static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001111tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001113 if (c != EOF) {
1114 if (--tok->cur < tok->buf)
1115 Py_FatalError("tok_backup: beginning of buffer");
1116 if (*tok->cur != c)
1117 *tok->cur = c;
1118 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119}
1120
1121
1122/* Return the token corresponding to a single character */
1123
1124int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001125PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001126{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001127 switch (c) {
1128 case '(': return LPAR;
1129 case ')': return RPAR;
1130 case '[': return LSQB;
1131 case ']': return RSQB;
1132 case ':': return COLON;
1133 case ',': return COMMA;
1134 case ';': return SEMI;
1135 case '+': return PLUS;
1136 case '-': return MINUS;
1137 case '*': return STAR;
1138 case '/': return SLASH;
1139 case '|': return VBAR;
1140 case '&': return AMPER;
1141 case '<': return LESS;
1142 case '>': return GREATER;
1143 case '=': return EQUAL;
1144 case '.': return DOT;
1145 case '%': return PERCENT;
1146 case '{': return LBRACE;
1147 case '}': return RBRACE;
1148 case '^': return CIRCUMFLEX;
1149 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001150 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 default: return OP;
1152 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001153}
1154
1155
Guido van Rossumfbab9051991-10-20 20:25:03 +00001156int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001157PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001158{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001159 switch (c1) {
1160 case '=':
1161 switch (c2) {
1162 case '=': return EQEQUAL;
1163 }
1164 break;
1165 case '!':
1166 switch (c2) {
1167 case '=': return NOTEQUAL;
1168 }
1169 break;
1170 case '<':
1171 switch (c2) {
1172 case '>': return NOTEQUAL;
1173 case '=': return LESSEQUAL;
1174 case '<': return LEFTSHIFT;
1175 }
1176 break;
1177 case '>':
1178 switch (c2) {
1179 case '=': return GREATEREQUAL;
1180 case '>': return RIGHTSHIFT;
1181 }
1182 break;
1183 case '+':
1184 switch (c2) {
1185 case '=': return PLUSEQUAL;
1186 }
1187 break;
1188 case '-':
1189 switch (c2) {
1190 case '=': return MINEQUAL;
1191 case '>': return RARROW;
1192 }
1193 break;
1194 case '*':
1195 switch (c2) {
1196 case '*': return DOUBLESTAR;
1197 case '=': return STAREQUAL;
1198 }
1199 break;
1200 case '/':
1201 switch (c2) {
1202 case '/': return DOUBLESLASH;
1203 case '=': return SLASHEQUAL;
1204 }
1205 break;
1206 case '|':
1207 switch (c2) {
1208 case '=': return VBAREQUAL;
1209 }
1210 break;
1211 case '%':
1212 switch (c2) {
1213 case '=': return PERCENTEQUAL;
1214 }
1215 break;
1216 case '&':
1217 switch (c2) {
1218 case '=': return AMPEREQUAL;
1219 }
1220 break;
1221 case '^':
1222 switch (c2) {
1223 case '=': return CIRCUMFLEXEQUAL;
1224 }
1225 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001226 case '@':
1227 switch (c2) {
1228 case '=': return ATEQUAL;
1229 }
1230 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 }
1232 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001233}
1234
Thomas Wouters434d0822000-08-24 20:11:32 +00001235int
1236PyToken_ThreeChars(int c1, int c2, int c3)
1237{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 switch (c1) {
1239 case '<':
1240 switch (c2) {
1241 case '<':
1242 switch (c3) {
1243 case '=':
1244 return LEFTSHIFTEQUAL;
1245 }
1246 break;
1247 }
1248 break;
1249 case '>':
1250 switch (c2) {
1251 case '>':
1252 switch (c3) {
1253 case '=':
1254 return RIGHTSHIFTEQUAL;
1255 }
1256 break;
1257 }
1258 break;
1259 case '*':
1260 switch (c2) {
1261 case '*':
1262 switch (c3) {
1263 case '=':
1264 return DOUBLESTAREQUAL;
1265 }
1266 break;
1267 }
1268 break;
1269 case '/':
1270 switch (c2) {
1271 case '/':
1272 switch (c3) {
1273 case '=':
1274 return DOUBLESLASHEQUAL;
1275 }
1276 break;
1277 }
1278 break;
1279 case '.':
1280 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001281 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 switch (c3) {
1283 case '.':
1284 return ELLIPSIS;
1285 }
1286 break;
1287 }
1288 break;
1289 }
1290 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001291}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001292
Guido van Rossum926f13a1998-04-09 21:38:06 +00001293static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001294indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001295{
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001296 tok->done = E_TABSPACE;
1297 tok->cur = tok->inp;
1298 return ERRORTOKEN;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001299}
1300
Martin v. Löwis47383402007-08-15 07:32:56 +00001301#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001302#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001303#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304/* Verify that the identifier follows PEP 3131.
1305 All identifier strings are guaranteed to be "ready" unicode objects.
1306 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001307static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001308verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001309{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 PyObject *s;
1311 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001312 if (tok->decoding_erred)
1313 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1317 PyErr_Clear();
1318 tok->done = E_IDENTIFIER;
1319 } else {
1320 tok->done = E_ERROR;
1321 }
1322 return 0;
1323 }
1324 result = PyUnicode_IsIdentifier(s);
1325 Py_DECREF(s);
1326 if (result == 0)
1327 tok->done = E_IDENTIFIER;
1328 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001329}
1330#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001331
Brett Cannona721aba2016-09-09 14:57:09 -07001332static int
1333tok_decimal_tail(struct tok_state *tok)
1334{
1335 int c;
1336
1337 while (1) {
1338 do {
1339 c = tok_nextc(tok);
1340 } while (isdigit(c));
1341 if (c != '_') {
1342 break;
1343 }
1344 c = tok_nextc(tok);
1345 if (!isdigit(c)) {
1346 tok->done = E_TOKEN;
1347 tok_backup(tok, c);
1348 return 0;
1349 }
1350 }
1351 return c;
1352}
1353
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354/* Get next token, after space stripping etc. */
1355
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001356static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001357tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001359 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001361
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001363 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001364 tok->start = NULL;
1365 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001366
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 /* Get indentation level */
1368 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001369 int col = 0;
1370 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 tok->atbol = 0;
1372 for (;;) {
1373 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001374 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001376 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 else if (c == '\t') {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001378 col = (col / tok->tabsize + 1) * tok->tabsize;
1379 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 }
Brett Cannona721aba2016-09-09 14:57:09 -07001381 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001383 }
1384 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001386 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 }
1388 tok_backup(tok, c);
1389 if (c == '#' || c == '\n') {
1390 /* Lines with only whitespace and/or comments
1391 shouldn't affect the indentation and are
1392 not passed to the parser as NEWLINE tokens,
1393 except *totally* empty lines in interactive
1394 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001395 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001397 }
1398 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001400 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 /* We can't jump back right here since we still
1402 may need to skip to the end of a comment */
1403 }
1404 if (!blankline && tok->level == 0) {
1405 if (col == tok->indstack[tok->indent]) {
1406 /* No change */
1407 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001408 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 }
1410 }
1411 else if (col > tok->indstack[tok->indent]) {
1412 /* Indent -- always one */
1413 if (tok->indent+1 >= MAXINDENT) {
1414 tok->done = E_TOODEEP;
1415 tok->cur = tok->inp;
1416 return ERRORTOKEN;
1417 }
1418 if (altcol <= tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001419 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001420 }
1421 tok->pendin++;
1422 tok->indstack[++tok->indent] = col;
1423 tok->altindstack[tok->indent] = altcol;
1424 }
1425 else /* col < tok->indstack[tok->indent] */ {
1426 /* Dedent -- any number, must be consistent */
1427 while (tok->indent > 0 &&
1428 col < tok->indstack[tok->indent]) {
1429 tok->pendin--;
1430 tok->indent--;
1431 }
1432 if (col != tok->indstack[tok->indent]) {
1433 tok->done = E_DEDENT;
1434 tok->cur = tok->inp;
1435 return ERRORTOKEN;
1436 }
1437 if (altcol != tok->altindstack[tok->indent]) {
Victor Stinnerf2ddc6a2017-11-17 01:25:47 -08001438 return indenterror(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 }
1440 }
1441 }
1442 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001443
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001445
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001446 /* Return pending indents/dedents */
1447 if (tok->pendin != 0) {
1448 if (tok->pendin < 0) {
1449 tok->pendin++;
1450 return DEDENT;
1451 }
1452 else {
1453 tok->pendin--;
1454 return INDENT;
1455 }
1456 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001457
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001458 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001459 tok->start = NULL;
1460 /* Skip spaces */
1461 do {
1462 c = tok_nextc(tok);
1463 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001464
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 /* Set start of current token */
1466 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001467
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001468 /* Skip comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001469 if (c == '#') {
1470 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001472 }
1473 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001474
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001475 /* Check for EOF and errors now */
1476 if (c == EOF) {
1477 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1478 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 /* Identifier (most frequent token!) */
1481 nonascii = 0;
1482 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001483 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001484 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001485 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001486 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001487 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001488 /* Since this is a backwards compatibility support literal we don't
1489 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001490 else if (!(saw_b || saw_u || saw_r || saw_f)
1491 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001492 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001493 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001494 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001495 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001496 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001497 }
1498 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001499 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001500 }
1501 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001502 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001503 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001505 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001507 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001508 }
1509 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001510 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001511 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001512 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 c = tok_nextc(tok);
1514 }
1515 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001516 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001518 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001519 *p_start = tok->start;
1520 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001521
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001522 return NAME;
1523 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001524
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 /* Newline */
1526 if (c == '\n') {
1527 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001528 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001529 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001530 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 *p_start = tok->start;
1532 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1533 tok->cont_line = 0;
1534 return NEWLINE;
1535 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001536
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 /* Period or number starting with period? */
1538 if (c == '.') {
1539 c = tok_nextc(tok);
1540 if (isdigit(c)) {
1541 goto fraction;
1542 } else if (c == '.') {
1543 c = tok_nextc(tok);
1544 if (c == '.') {
1545 *p_start = tok->start;
1546 *p_end = tok->cur;
1547 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001548 }
1549 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001550 tok_backup(tok, c);
1551 }
1552 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001553 }
1554 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 tok_backup(tok, c);
1556 }
1557 *p_start = tok->start;
1558 *p_end = tok->cur;
1559 return DOT;
1560 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001561
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001562 /* Number */
1563 if (isdigit(c)) {
1564 if (c == '0') {
1565 /* Hex, octal or binary -- maybe. */
1566 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001568 /* Hex */
1569 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001570 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001571 if (c == '_') {
1572 c = tok_nextc(tok);
1573 }
1574 if (!isxdigit(c)) {
1575 tok->done = E_TOKEN;
1576 tok_backup(tok, c);
1577 return ERRORTOKEN;
1578 }
1579 do {
1580 c = tok_nextc(tok);
1581 } while (isxdigit(c));
1582 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001583 }
1584 else if (c == 'o' || c == 'O') {
1585 /* Octal */
1586 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001588 if (c == '_') {
1589 c = tok_nextc(tok);
1590 }
1591 if (c < '0' || c >= '8') {
1592 tok->done = E_TOKEN;
1593 tok_backup(tok, c);
1594 return ERRORTOKEN;
1595 }
1596 do {
1597 c = tok_nextc(tok);
1598 } while ('0' <= c && c < '8');
1599 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 }
1601 else if (c == 'b' || c == 'B') {
1602 /* Binary */
1603 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001605 if (c == '_') {
1606 c = tok_nextc(tok);
1607 }
1608 if (c != '0' && c != '1') {
1609 tok->done = E_TOKEN;
1610 tok_backup(tok, c);
1611 return ERRORTOKEN;
1612 }
1613 do {
1614 c = tok_nextc(tok);
1615 } while (c == '0' || c == '1');
1616 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 }
1618 else {
1619 int nonzero = 0;
1620 /* maybe old-style octal; c is first char of it */
1621 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001622 while (1) {
1623 if (c == '_') {
1624 c = tok_nextc(tok);
1625 if (!isdigit(c)) {
1626 tok->done = E_TOKEN;
1627 tok_backup(tok, c);
1628 return ERRORTOKEN;
1629 }
1630 }
1631 if (c != '0') {
1632 break;
1633 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 c = tok_nextc(tok);
1635 }
Brett Cannona721aba2016-09-09 14:57:09 -07001636 if (isdigit(c)) {
1637 nonzero = 1;
1638 c = tok_decimal_tail(tok);
1639 if (c == 0) {
1640 return ERRORTOKEN;
1641 }
1642 }
1643 if (c == '.') {
1644 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001646 }
1647 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001649 }
1650 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001652 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001654 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 tok->done = E_TOKEN;
1656 tok_backup(tok, c);
1657 return ERRORTOKEN;
1658 }
1659 }
1660 }
1661 else {
1662 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001663 c = tok_decimal_tail(tok);
1664 if (c == 0) {
1665 return ERRORTOKEN;
1666 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001667 {
1668 /* Accept floating point numbers. */
1669 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001670 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 fraction:
1672 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001673 if (isdigit(c)) {
1674 c = tok_decimal_tail(tok);
1675 if (c == 0) {
1676 return ERRORTOKEN;
1677 }
1678 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 }
1680 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001681 int e;
1682 exponent:
1683 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 /* Exponent part */
1685 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001686 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001688 if (!isdigit(c)) {
1689 tok->done = E_TOKEN;
1690 tok_backup(tok, c);
1691 return ERRORTOKEN;
1692 }
1693 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001694 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001695 tok_backup(tok, e);
1696 *p_start = tok->start;
1697 *p_end = tok->cur;
1698 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 }
Brett Cannona721aba2016-09-09 14:57:09 -07001700 c = tok_decimal_tail(tok);
1701 if (c == 0) {
1702 return ERRORTOKEN;
1703 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 }
Brett Cannona721aba2016-09-09 14:57:09 -07001705 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 /* Imaginary part */
1707 imaginary:
1708 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001709 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 }
1711 }
1712 tok_backup(tok, c);
1713 *p_start = tok->start;
1714 *p_end = tok->cur;
1715 return NUMBER;
1716 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001717
1718 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 /* String */
1720 if (c == '\'' || c == '"') {
1721 int quote = c;
1722 int quote_size = 1; /* 1 or 3 */
1723 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001724
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001725 /* Find the quote size and start of string */
1726 c = tok_nextc(tok);
1727 if (c == quote) {
1728 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001729 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001731 }
1732 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001734 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 }
Brett Cannona721aba2016-09-09 14:57:09 -07001736 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001738 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001739
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 /* Get rest of string */
1741 while (end_quote_size != quote_size) {
1742 c = tok_nextc(tok);
1743 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001744 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001746 }
1747 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001749 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750 tok->cur = tok->inp;
1751 return ERRORTOKEN;
1752 }
1753 if (quote_size == 1 && c == '\n') {
1754 tok->done = E_EOLS;
1755 tok->cur = tok->inp;
1756 return ERRORTOKEN;
1757 }
Brett Cannona721aba2016-09-09 14:57:09 -07001758 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001760 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001761 else {
1762 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001763 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001764 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001765 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 }
1767 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001768
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 *p_start = tok->start;
1770 *p_end = tok->cur;
1771 return STRING;
1772 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001773
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 /* Line continuation */
1775 if (c == '\\') {
1776 c = tok_nextc(tok);
1777 if (c != '\n') {
1778 tok->done = E_LINECONT;
1779 tok->cur = tok->inp;
1780 return ERRORTOKEN;
1781 }
1782 tok->cont_line = 1;
1783 goto again; /* Read next line */
1784 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001785
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001786 /* Check for two-character token */
1787 {
1788 int c2 = tok_nextc(tok);
1789 int token = PyToken_TwoChars(c, c2);
1790 if (token != OP) {
1791 int c3 = tok_nextc(tok);
1792 int token3 = PyToken_ThreeChars(c, c2, c3);
1793 if (token3 != OP) {
1794 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001795 }
1796 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001797 tok_backup(tok, c3);
1798 }
1799 *p_start = tok->start;
1800 *p_end = tok->cur;
1801 return token;
1802 }
1803 tok_backup(tok, c2);
1804 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001805
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 /* Keep track of parentheses nesting level */
1807 switch (c) {
1808 case '(':
1809 case '[':
1810 case '{':
1811 tok->level++;
1812 break;
1813 case ')':
1814 case ']':
1815 case '}':
1816 tok->level--;
1817 break;
1818 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001819
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001820 /* Punctuation character */
1821 *p_start = tok->start;
1822 *p_end = tok->cur;
1823 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001824}
1825
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001826int
1827PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1828{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001829 int result = tok_get(tok, p_start, p_end);
1830 if (tok->decoding_erred) {
1831 result = ERRORTOKEN;
1832 tok->done = E_DECODE;
1833 }
1834 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001835}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001836
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001837/* Get the encoding of a Python file. Check for the coding cookie and check if
1838 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001839
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001840 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1841 encoding in the first or second line of the file (in which case the encoding
1842 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001843
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001844 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1845 by the caller. */
1846
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001847char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001848PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001849{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 struct tok_state *tok;
1851 FILE *fp;
1852 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001853
Victor Stinnerdaf45552013-08-28 00:53:59 +02001854#ifndef PGEN
1855 fd = _Py_dup(fd);
1856#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001857 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001858#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001859 if (fd < 0) {
1860 return NULL;
1861 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001862
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001863 fp = fdopen(fd, "r");
1864 if (fp == NULL) {
1865 return NULL;
1866 }
1867 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1868 if (tok == NULL) {
1869 fclose(fp);
1870 return NULL;
1871 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001872#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001873 if (filename != NULL) {
1874 Py_INCREF(filename);
1875 tok->filename = filename;
1876 }
1877 else {
1878 tok->filename = PyUnicode_FromString("<string>");
1879 if (tok->filename == NULL) {
1880 fclose(fp);
1881 PyTokenizer_Free(tok);
1882 return encoding;
1883 }
1884 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001885#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001886 while (tok->lineno < 2 && tok->done == E_OK) {
1887 PyTokenizer_Get(tok, &p_start, &p_end);
1888 }
1889 fclose(fp);
1890 if (tok->encoding) {
1891 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1892 if (encoding)
1893 strcpy(encoding, tok->encoding);
1894 }
1895 PyTokenizer_Free(tok);
1896 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001897}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001898
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001899char *
1900PyTokenizer_FindEncoding(int fd)
1901{
1902 return PyTokenizer_FindEncodingFilename(fd, NULL);
1903}
1904
Guido van Rossum408027e1996-12-30 16:17:54 +00001905#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001906
1907void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001908tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001909{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001910 printf("%s", _PyParser_TokenNames[type]);
1911 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1912 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001913}
1914
1915#endif