blob: 90a8270c1924f7abe821f9b7b381c1a0d46f21c3 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero if C may be the first character of an identifier: an ASCII
   letter, an underscore, or any value >= 128 (i.e. outside the ASCII range).
   The argument is parenthesized so that expressions containing operators
   expand correctly; it is still evaluated several times, so it must not have
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128 (i.e. outside the ASCII range).
   The argument is parenthesized so that expressions containing operators
   expand correctly; it is still evaluated several times, so it must not have
   side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, const char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers that are used before they are defined */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names: one string per token type.
   This table must match the #defines in token.h!
   The trailing comments give the index range covered by each row. */

const char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",             /*  0- 4 */
    "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",                     /*  5- 9 */
    "RSQB", "COLON", "COMMA", "SEMI", "PLUS",                       /* 10-14 */
    "MINUS", "STAR", "SLASH", "VBAR", "AMPER",                      /* 15-19 */
    "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",                   /* 20-24 */
    "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL", "LESSEQUAL",         /* 25-29 */
    "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT",             /* 30-33 */
    "RIGHTSHIFT", "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL",            /* 34-37 */
    "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL", "AMPEREQUAL",        /* 38-41 */
    "VBAREQUAL", "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL",               /* 42-44 */
    "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL", "DOUBLESLASH",            /* 45-47 */
    "DOUBLESLASHEQUAL", "AT", "ATEQUAL", "RARROW", "ELLIPSIS",      /* 48-52 */
    "OP", "AWAIT", "ASYNC",                                         /* 53-55 */
    "<ERRORTOKEN>", "<N_TOKENS>"                                    /* 56-57 */
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000116tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119 sizeof(struct tok_state));
120 if (tok == NULL)
121 return NULL;
122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123 tok->done = E_OK;
124 tok->fp = NULL;
125 tok->input = NULL;
126 tok->tabsize = TABSIZE;
127 tok->indent = 0;
128 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400129
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
134 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 tok->altwarning = 1;
136 tok->alterror = 1;
137 tok->alttabsize = 1;
138 tok->altindstack[0] = 0;
139 tok->decoding_state = STATE_INIT;
140 tok->decoding_erred = 0;
141 tok->read_coding_spec = 0;
142 tok->enc = NULL;
143 tok->encoding = NULL;
144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200146 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 tok->decoding_readline = NULL;
148 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000149#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300150
151 tok->async_def = 0;
152 tok->async_def_indent = 0;
153 tok->async_def_nl = 0;
154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156}
157
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 if (!result) {
163 tok->done = E_NOMEM;
164 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 memcpy(result, s, len);
167 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000169}
170
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000171#ifdef PGEN
172
173static char *
174decoding_fgets(char *s, int size, struct tok_state *tok)
175{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177}
178
179static int
180decoding_feof(struct tok_state *tok)
181{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183}
184
/* PGEN build: no decoding is done; return an allocated copy of STR
   (NULL if new_string fails).  EXEC_INPUT is not used. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length, tok);
}
190
191#else /* PGEN */
192
193static char *
194error_ret(struct tok_state *tok) /* XXX */
195{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 tok->decoding_erred = 1;
197 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200199 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000204
/* Map the spellings of utf-8 and latin-1 that the tokenizer recognizes onto
   one canonical name.

   At most the first 12 bytes of S are examined, after lowering ASCII letters
   and turning '_' into '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   the prefix matches one of the known aliases (optionally followed by
   "-suffix"); otherwise returns S itself, so callers can detect "no change"
   by comparing pointers.

   Lowering is done by hand and restricted to ASCII: tolower() depends on the
   current locale and must not be given a negative value, which a plain char
   above 0x7F may be. */
static const char *
get_normal_name(const char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = (char)c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233
234/* Return the coding spec in S, or NULL if none is found. */
235
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700236static int
237get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000238{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700240 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 /* Coding spec must be in a comment, and that comment must be
242 * the only statement on the source code line. */
243 for (i = 0; i < size - 6; i++) {
244 if (s[i] == '#')
245 break;
246 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 }
249 for (; i < size - 6; i++) { /* XXX inefficient search */
250 const char* t = s + i;
251 if (strncmp(t, "coding", 6) == 0) {
252 const char* begin = NULL;
253 t += 6;
254 if (t[0] != ':' && t[0] != '=')
255 continue;
256 do {
257 t++;
258 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 begin = t;
261 while (Py_ISALNUM(t[0]) ||
262 t[0] == '-' || t[0] == '_' || t[0] == '.')
263 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200267 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700268 if (!r)
269 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700270 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 if (r != q) {
272 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700273 r = new_string(q, strlen(q), tok);
274 if (!r)
275 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700277 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200278 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 }
280 }
281 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700282 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283}
284
285/* Check whether the line contains a coding spec. If it does,
286 invoke the set_readline function for the new encoding.
287 This function receives the tok_state and the new encoding.
288 Return 1 on success, 0 on failure. */
289
290static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000291check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700294 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000295 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000296
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200297 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200299 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000300 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200301 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700302 if (!get_coding_spec(line, &cs, size, tok))
303 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200304 if (!cs) {
305 Py_ssize_t i;
306 for (i = 0; i < size; i++) {
307 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
308 break;
309 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
310 /* Stop checking coding spec after a line containing
311 * anything except a comment. */
312 tok->read_coding_spec = 1;
313 break;
314 }
315 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 tok->read_coding_spec = 1;
319 if (tok->encoding == NULL) {
320 assert(tok->decoding_state == STATE_RAW);
321 if (strcmp(cs, "utf-8") == 0) {
322 tok->encoding = cs;
323 } else {
324 r = set_readline(tok, cs);
325 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700327 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000328 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700329 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300330 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700331 "encoding problem: %s", cs);
332 PyMem_FREE(cs);
333 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700335 } else { /* then, compare cs with BOM */
336 r = (strcmp(tok->encoding, cs) == 0);
337 if (!r)
338 PyErr_Format(PyExc_SyntaxError,
339 "encoding problem: %s with BOM", cs);
340 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000342 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343}
344
345/* See whether the file starts with a BOM. If it does,
346 invoke the set_readline function with the new encoding.
347 Return 1 on success, 0 on failure. */
348
349static int
350check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000351 void unget_char(int, struct tok_state *),
352 int set_readline(struct tok_state *, const char *),
353 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000354{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000355 int ch1, ch2, ch3;
356 ch1 = get_char(tok);
357 tok->decoding_state = STATE_RAW;
358 if (ch1 == EOF) {
359 return 1;
360 } else if (ch1 == 0xEF) {
361 ch2 = get_char(tok);
362 if (ch2 != 0xBB) {
363 unget_char(ch2, tok);
364 unget_char(ch1, tok);
365 return 1;
366 }
367 ch3 = get_char(tok);
368 if (ch3 != 0xBF) {
369 unget_char(ch3, tok);
370 unget_char(ch2, tok);
371 unget_char(ch1, tok);
372 return 1;
373 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000374#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 /* Disable support for UTF-16 BOMs until a decision
376 is made whether this needs to be supported. */
377 } else if (ch1 == 0xFE) {
378 ch2 = get_char(tok);
379 if (ch2 != 0xFF) {
380 unget_char(ch2, tok);
381 unget_char(ch1, tok);
382 return 1;
383 }
384 if (!set_readline(tok, "utf-16-be"))
385 return 0;
386 tok->decoding_state = STATE_NORMAL;
387 } else if (ch1 == 0xFF) {
388 ch2 = get_char(tok);
389 if (ch2 != 0xFE) {
390 unget_char(ch2, tok);
391 unget_char(ch1, tok);
392 return 1;
393 }
394 if (!set_readline(tok, "utf-16-le"))
395 return 0;
396 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 } else {
399 unget_char(ch1, tok);
400 return 1;
401 }
402 if (tok->encoding != NULL)
403 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700404 tok->encoding = new_string("utf-8", 5, tok);
405 if (!tok->encoding)
406 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 /* No need to set_readline: input is already utf-8 */
408 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000409}
410
411/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000412 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000413
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000414 On entry, tok->decoding_buffer will be one of:
415 1) NULL: need to call tok->decoding_readline to get a new line
416 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000418 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 (in the s buffer) to copy entire contents of the line read
420 by tok->decoding_readline. tok->decoding_buffer has the overflow.
421 In this case, fp_readl is called in a loop (with an expanded buffer)
422 until the buffer ends with a '\n' (or until the end of the file is
423 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000424*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426static char *
427fp_readl(char *s, int size, struct tok_state *tok)
428{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000429 PyObject* bufobj;
430 const char *buf;
431 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000433 /* Ask for one less byte so we can terminate it */
434 assert(size > 0);
435 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000436
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000437 if (tok->decoding_buffer) {
438 bufobj = tok->decoding_buffer;
439 Py_INCREF(bufobj);
440 }
441 else
442 {
443 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
444 if (bufobj == NULL)
445 goto error;
446 }
447 if (PyUnicode_CheckExact(bufobj))
448 {
449 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
450 if (buf == NULL) {
451 goto error;
452 }
453 }
454 else
455 {
456 buf = PyByteArray_AsString(bufobj);
457 if (buf == NULL) {
458 goto error;
459 }
460 buflen = PyByteArray_GET_SIZE(bufobj);
461 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000462
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 Py_XDECREF(tok->decoding_buffer);
464 if (buflen > size) {
465 /* Too many chars, the rest goes into tok->decoding_buffer */
466 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
467 buflen-size);
468 if (tok->decoding_buffer == NULL)
469 goto error;
470 buflen = size;
471 }
472 else
473 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000474
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 memcpy(s, buf, buflen);
476 s[buflen] = '\0';
477 if (buflen == 0) /* EOF */
478 s = NULL;
479 Py_DECREF(bufobj);
480 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000481
482error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000483 Py_XDECREF(bufobj);
484 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485}
486
487/* Set the readline function for TOK to a StreamReader's
488 readline function. The StreamReader is named ENC.
489
490 This function is called from check_bom and check_coding_spec.
491
492 ENC is usually identical to the future value of tok->encoding,
493 except for the (currently unsupported) case of UTF-16.
494
495 Return 1 on success, 0 on failure. */
496
497static int
498fp_setreadl(struct tok_state *tok, const char* enc)
499{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000500 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200501 _Py_IDENTIFIER(open);
502 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000503 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200504 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 io = PyImport_ImportModuleNoBlock("io");
507 if (io == NULL)
508 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000509
Victor Stinner22a351a2010-10-14 12:04:34 +0000510 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200511 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100512 * position of tok->fp. If tok->fp was opened in text mode on Windows,
513 * its file position counts CRLF as one char and can't be directly mapped
514 * to the file offset for fd. Instead we step back one byte and read to
515 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200516 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100517 if (pos == -1 ||
518 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000519 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
520 goto cleanup;
521 }
522
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200523 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000524 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000525 if (stream == NULL)
526 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200528 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Serhiy Storchaka48842712016-04-06 09:45:48 +0300529 Py_XSETREF(tok->decoding_readline, readline);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100530 if (pos > 0) {
531 if (PyObject_CallObject(readline, NULL) == NULL) {
532 readline = NULL;
533 goto cleanup;
534 }
535 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000536
537 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000538 Py_XDECREF(stream);
539 Py_XDECREF(io);
540 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541}
542
543/* Fetch the next byte from TOK. */
544
545static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547}
548
549/* Unfetch the last byte back into TOK. */
550
551static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000552 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
/* Check whether the bytes at s start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   Only the lead byte and the 10xxxxxx pattern of the continuation bytes are
   checked.  Continuation bytes are examined front to back, so a NUL
   terminator that cuts a sequence short is seen (and rejected) before any
   byte beyond it is read. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return trailing + 1;
}
582
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000583/* Read a line of input from TOK. Determine encoding
584 if necessary. */
585
586static char *
587decoding_fgets(char *s, int size, struct tok_state *tok)
588{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000589 char *line = NULL;
590 int badchar = 0;
591 for (;;) {
592 if (tok->decoding_state == STATE_NORMAL) {
593 /* We already have a codec associated with
594 this input. */
595 line = fp_readl(s, size, tok);
596 break;
597 } else if (tok->decoding_state == STATE_RAW) {
598 /* We want a 'raw' read. */
599 line = Py_UniversalNewlineFgets(s, size,
600 tok->fp, NULL);
601 break;
602 } else {
603 /* We have not yet determined the encoding.
604 If an encoding is found, use the file-pointer
605 reader functions from now on. */
606 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
607 return error_ret(tok);
608 assert(tok->decoding_state != STATE_INIT);
609 }
610 }
611 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
612 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
613 return error_ret(tok);
614 }
615 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000616#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000617 /* The default encoding is UTF-8, so make sure we don't have any
618 non-UTF-8 sequences in it. */
619 if (line && !tok->encoding) {
620 unsigned char *c;
621 int length;
622 for (c = (unsigned char *)line; *c; c += length)
623 if (!(length = valid_utf8(c))) {
624 badchar = *c;
625 break;
626 }
627 }
628 if (badchar) {
629 /* Need to add 1 to the line number, since this line
630 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200631 PyErr_Format(PyExc_SyntaxError,
632 "Non-UTF-8 code starting with '\\x%.2x' "
633 "in file %U on line %i, "
634 "but no encoding declared; "
635 "see http://python.org/dev/peps/pep-0263/ for details",
636 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return error_ret(tok);
638 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641}
642
643static int
644decoding_feof(struct tok_state *tok)
645{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 if (tok->decoding_state != STATE_NORMAL) {
647 return feof(tok->fp);
648 } else {
649 PyObject* buf = tok->decoding_buffer;
650 if (buf == NULL) {
651 buf = PyObject_CallObject(tok->decoding_readline, NULL);
652 if (buf == NULL) {
653 error_ret(tok);
654 return 1;
655 } else {
656 tok->decoding_buffer = buf;
657 }
658 }
659 return PyObject_Length(buf) == 0;
660 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661}
662
663/* Fetch a byte from TOK, using the string buffer. */
664
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000665static int
666buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000667 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668}
669
670/* Unfetch a byte from TOK, using the string buffer. */
671
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000672static void
673buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 tok->str--;
675 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676}
677
678/* Set the readline function for TOK to ENC. For the string-based
679 tokenizer, this means to just record the encoding. */
680
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000681static int
682buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 tok->enc = enc;
684 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000685}
686
687/* Return a UTF-8 encoding Python string object from the
688 C byte string STR, which is encoded with ENC. */
689
690static PyObject *
691translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000692 PyObject *utf8;
693 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
694 if (buf == NULL)
695 return NULL;
696 utf8 = PyUnicode_AsUTF8String(buf);
697 Py_DECREF(buf);
698 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699}
700
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000701
702static char *
703translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200704 int skip_next_lf = 0;
705 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 char *buf, *current;
707 char c = '\0';
708 buf = PyMem_MALLOC(needed_length);
709 if (buf == NULL) {
710 tok->done = E_NOMEM;
711 return NULL;
712 }
713 for (current = buf; *s; s++, current++) {
714 c = *s;
715 if (skip_next_lf) {
716 skip_next_lf = 0;
717 if (c == '\n') {
718 c = *++s;
719 if (!c)
720 break;
721 }
722 }
723 if (c == '\r') {
724 skip_next_lf = 1;
725 c = '\n';
726 }
727 *current = c;
728 }
729 /* If this is exec input, add a newline to the end of the string if
730 there isn't one already. */
731 if (exec_input && c != '\n') {
732 *current = '\n';
733 current++;
734 }
735 *current = '\0';
736 final_length = current - buf + 1;
737 if (final_length < needed_length && final_length)
738 /* should never fail */
739 buf = PyMem_REALLOC(buf, final_length);
740 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000741}
742
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000743/* Decode a byte string STR for use as the buffer of TOK.
744 Look for encoding declarations inside STR, and record them
745 inside TOK. */
746
747static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000748decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000749{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 PyObject* utf8 = NULL;
751 const char *str;
752 const char *s;
753 const char *newl[2] = {NULL, NULL};
754 int lineno = 0;
755 tok->input = str = translate_newlines(input, single, tok);
756 if (str == NULL)
757 return NULL;
758 tok->enc = NULL;
759 tok->str = str;
760 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
761 return error_ret(tok);
762 str = tok->str; /* string after BOM if any */
763 assert(str);
764 if (tok->enc != NULL) {
765 utf8 = translate_into_utf8(str, tok->enc);
766 if (utf8 == NULL)
767 return error_ret(tok);
768 str = PyBytes_AsString(utf8);
769 }
770 for (s = str;; s++) {
771 if (*s == '\0') break;
772 else if (*s == '\n') {
773 assert(lineno < 2);
774 newl[lineno] = s;
775 lineno++;
776 if (lineno == 2) break;
777 }
778 }
779 tok->enc = NULL;
780 /* need to check line 1 and 2 separately since check_coding_spec
781 assumes a single line as input */
782 if (newl[0]) {
783 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
784 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200785 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
787 tok, buf_setreadl))
788 return error_ret(tok);
789 }
790 }
791 if (tok->enc != NULL) {
792 assert(utf8 == NULL);
793 utf8 = translate_into_utf8(str, tok->enc);
794 if (utf8 == NULL)
795 return error_ret(tok);
796 str = PyBytes_AS_STRING(utf8);
797 }
798 assert(tok->decoding_buffer == NULL);
799 tok->decoding_buffer = utf8; /* CAUTION */
800 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000801}
802
803#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000804
805/* Set up tokenizer for string */
806
807struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000808PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 struct tok_state *tok = tok_new();
811 if (tok == NULL)
812 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300813 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 if (str == NULL) {
815 PyTokenizer_Free(tok);
816 return NULL;
817 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000818
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 /* XXX: constify members. */
820 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
821 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822}
823
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000824struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000825PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000826{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 struct tok_state *tok = tok_new();
828 if (tok == NULL)
829 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000830#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000832#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 if (str == NULL) {
834 PyTokenizer_Free(tok);
835 return NULL;
836 }
837 tok->decoding_state = STATE_RAW;
838 tok->read_coding_spec = 1;
839 tok->enc = NULL;
840 tok->str = str;
841 tok->encoding = (char *)PyMem_MALLOC(6);
842 if (!tok->encoding) {
843 PyTokenizer_Free(tok);
844 return NULL;
845 }
846 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000847
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 /* XXX: constify members. */
849 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
850 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000851}
852
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000853/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854
855struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300856PyTokenizer_FromFile(FILE *fp, const char* enc,
857 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000859 struct tok_state *tok = tok_new();
860 if (tok == NULL)
861 return NULL;
862 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
863 PyTokenizer_Free(tok);
864 return NULL;
865 }
866 tok->cur = tok->inp = tok->buf;
867 tok->end = tok->buf + BUFSIZ;
868 tok->fp = fp;
869 tok->prompt = ps1;
870 tok->nextprompt = ps2;
871 if (enc != NULL) {
872 /* Must copy encoding declaration since it
873 gets copied into the parse tree. */
874 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
875 if (!tok->encoding) {
876 PyTokenizer_Free(tok);
877 return NULL;
878 }
879 strcpy(tok->encoding, enc);
880 tok->decoding_state = STATE_NORMAL;
881 }
882 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883}
884
885
886/* Free a tok_state structure */
887
888void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000889PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000890{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000891 if (tok->encoding != NULL)
892 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000893#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894 Py_XDECREF(tok->decoding_readline);
895 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200896 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000897#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 if (tok->fp != NULL && tok->buf != NULL)
899 PyMem_FREE(tok->buf);
900 if (tok->input)
901 PyMem_FREE((char *)tok->input);
902 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000903}
904
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905/* Get next char, updating state; error code goes into tok->done */
906
907static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200908tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910 for (;;) {
911 if (tok->cur != tok->inp) {
912 return Py_CHARMASK(*tok->cur++); /* Fast path */
913 }
914 if (tok->done != E_OK)
915 return EOF;
916 if (tok->fp == NULL) {
917 char *end = strchr(tok->inp, '\n');
918 if (end != NULL)
919 end++;
920 else {
921 end = strchr(tok->inp, '\0');
922 if (end == tok->inp) {
923 tok->done = E_EOF;
924 return EOF;
925 }
926 }
927 if (tok->start == NULL)
928 tok->buf = tok->cur;
929 tok->line_start = tok->cur;
930 tok->lineno++;
931 tok->inp = end;
932 return Py_CHARMASK(*tok->cur++);
933 }
934 if (tok->prompt != NULL) {
935 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000936#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000937 if (newtok != NULL) {
938 char *translated = translate_newlines(newtok, 0, tok);
939 PyMem_FREE(newtok);
940 if (translated == NULL)
941 return EOF;
942 newtok = translated;
943 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000944 if (tok->encoding && newtok && *newtok) {
945 /* Recode to UTF-8 */
946 Py_ssize_t buflen;
947 const char* buf;
948 PyObject *u = translate_into_utf8(newtok, tok->encoding);
949 PyMem_FREE(newtok);
950 if (!u) {
951 tok->done = E_DECODE;
952 return EOF;
953 }
954 buflen = PyBytes_GET_SIZE(u);
955 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000956 newtok = PyMem_MALLOC(buflen+1);
957 strcpy(newtok, buf);
958 Py_DECREF(u);
959 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000960#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000961 if (tok->nextprompt != NULL)
962 tok->prompt = tok->nextprompt;
963 if (newtok == NULL)
964 tok->done = E_INTR;
965 else if (*newtok == '\0') {
966 PyMem_FREE(newtok);
967 tok->done = E_EOF;
968 }
969 else if (tok->start != NULL) {
970 size_t start = tok->start - tok->buf;
971 size_t oldlen = tok->cur - tok->buf;
972 size_t newlen = oldlen + strlen(newtok);
973 char *buf = tok->buf;
974 buf = (char *)PyMem_REALLOC(buf, newlen+1);
975 tok->lineno++;
976 if (buf == NULL) {
977 PyMem_FREE(tok->buf);
978 tok->buf = NULL;
979 PyMem_FREE(newtok);
980 tok->done = E_NOMEM;
981 return EOF;
982 }
983 tok->buf = buf;
984 tok->cur = tok->buf + oldlen;
985 tok->line_start = tok->cur;
986 strcpy(tok->buf + oldlen, newtok);
987 PyMem_FREE(newtok);
988 tok->inp = tok->buf + newlen;
989 tok->end = tok->inp + 1;
990 tok->start = tok->buf + start;
991 }
992 else {
993 tok->lineno++;
994 if (tok->buf != NULL)
995 PyMem_FREE(tok->buf);
996 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000997 tok->cur = tok->buf;
998 tok->line_start = tok->buf;
999 tok->inp = strchr(tok->buf, '\0');
1000 tok->end = tok->inp + 1;
1001 }
1002 }
1003 else {
1004 int done = 0;
1005 Py_ssize_t cur = 0;
1006 char *pt;
1007 if (tok->start == NULL) {
1008 if (tok->buf == NULL) {
1009 tok->buf = (char *)
1010 PyMem_MALLOC(BUFSIZ);
1011 if (tok->buf == NULL) {
1012 tok->done = E_NOMEM;
1013 return EOF;
1014 }
1015 tok->end = tok->buf + BUFSIZ;
1016 }
1017 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1018 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001019 if (!tok->decoding_erred)
1020 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 done = 1;
1022 }
1023 else {
1024 tok->done = E_OK;
1025 tok->inp = strchr(tok->buf, '\0');
1026 done = tok->inp[-1] == '\n';
1027 }
1028 }
1029 else {
1030 cur = tok->cur - tok->buf;
1031 if (decoding_feof(tok)) {
1032 tok->done = E_EOF;
1033 done = 1;
1034 }
1035 else
1036 tok->done = E_OK;
1037 }
1038 tok->lineno++;
1039 /* Read until '\n' or EOF */
1040 while (!done) {
1041 Py_ssize_t curstart = tok->start == NULL ? -1 :
1042 tok->start - tok->buf;
1043 Py_ssize_t curvalid = tok->inp - tok->buf;
1044 Py_ssize_t newsize = curvalid + BUFSIZ;
1045 char *newbuf = tok->buf;
1046 newbuf = (char *)PyMem_REALLOC(newbuf,
1047 newsize);
1048 if (newbuf == NULL) {
1049 tok->done = E_NOMEM;
1050 tok->cur = tok->inp;
1051 return EOF;
1052 }
1053 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001054 tok->cur = tok->buf + cur;
1055 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 tok->inp = tok->buf + curvalid;
1057 tok->end = tok->buf + newsize;
1058 tok->start = curstart < 0 ? NULL :
1059 tok->buf + curstart;
1060 if (decoding_fgets(tok->inp,
1061 (int)(tok->end - tok->inp),
1062 tok) == NULL) {
1063 /* Break out early on decoding
1064 errors, as tok->buf will be NULL
1065 */
1066 if (tok->decoding_erred)
1067 return EOF;
1068 /* Last line does not end in \n,
1069 fake one */
1070 strcpy(tok->inp, "\n");
1071 }
1072 tok->inp = strchr(tok->inp, '\0');
1073 done = tok->inp[-1] == '\n';
1074 }
1075 if (tok->buf != NULL) {
1076 tok->cur = tok->buf + cur;
1077 tok->line_start = tok->cur;
1078 /* replace "\r\n" with "\n" */
1079 /* For Mac leave the \r, giving a syntax error */
1080 pt = tok->inp - 2;
1081 if (pt >= tok->buf && *pt == '\r') {
1082 *pt++ = '\n';
1083 *pt = '\0';
1084 tok->inp = pt;
1085 }
1086 }
1087 }
1088 if (tok->done != E_OK) {
1089 if (tok->prompt != NULL)
1090 PySys_WriteStderr("\n");
1091 tok->cur = tok->inp;
1092 return EOF;
1093 }
1094 }
1095 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001096}
1097
1098
1099/* Back-up one character */
1100
1101static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001102tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001104 if (c != EOF) {
1105 if (--tok->cur < tok->buf)
1106 Py_FatalError("tok_backup: beginning of buffer");
1107 if (*tok->cur != c)
1108 *tok->cur = c;
1109 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001110}
1111
1112
1113/* Return the token corresponding to a single character */
1114
1115int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001116PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 switch (c) {
1119 case '(': return LPAR;
1120 case ')': return RPAR;
1121 case '[': return LSQB;
1122 case ']': return RSQB;
1123 case ':': return COLON;
1124 case ',': return COMMA;
1125 case ';': return SEMI;
1126 case '+': return PLUS;
1127 case '-': return MINUS;
1128 case '*': return STAR;
1129 case '/': return SLASH;
1130 case '|': return VBAR;
1131 case '&': return AMPER;
1132 case '<': return LESS;
1133 case '>': return GREATER;
1134 case '=': return EQUAL;
1135 case '.': return DOT;
1136 case '%': return PERCENT;
1137 case '{': return LBRACE;
1138 case '}': return RBRACE;
1139 case '^': return CIRCUMFLEX;
1140 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001141 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 default: return OP;
1143 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144}
1145
1146
Guido van Rossumfbab9051991-10-20 20:25:03 +00001147int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001148PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001149{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001150 switch (c1) {
1151 case '=':
1152 switch (c2) {
1153 case '=': return EQEQUAL;
1154 }
1155 break;
1156 case '!':
1157 switch (c2) {
1158 case '=': return NOTEQUAL;
1159 }
1160 break;
1161 case '<':
1162 switch (c2) {
1163 case '>': return NOTEQUAL;
1164 case '=': return LESSEQUAL;
1165 case '<': return LEFTSHIFT;
1166 }
1167 break;
1168 case '>':
1169 switch (c2) {
1170 case '=': return GREATEREQUAL;
1171 case '>': return RIGHTSHIFT;
1172 }
1173 break;
1174 case '+':
1175 switch (c2) {
1176 case '=': return PLUSEQUAL;
1177 }
1178 break;
1179 case '-':
1180 switch (c2) {
1181 case '=': return MINEQUAL;
1182 case '>': return RARROW;
1183 }
1184 break;
1185 case '*':
1186 switch (c2) {
1187 case '*': return DOUBLESTAR;
1188 case '=': return STAREQUAL;
1189 }
1190 break;
1191 case '/':
1192 switch (c2) {
1193 case '/': return DOUBLESLASH;
1194 case '=': return SLASHEQUAL;
1195 }
1196 break;
1197 case '|':
1198 switch (c2) {
1199 case '=': return VBAREQUAL;
1200 }
1201 break;
1202 case '%':
1203 switch (c2) {
1204 case '=': return PERCENTEQUAL;
1205 }
1206 break;
1207 case '&':
1208 switch (c2) {
1209 case '=': return AMPEREQUAL;
1210 }
1211 break;
1212 case '^':
1213 switch (c2) {
1214 case '=': return CIRCUMFLEXEQUAL;
1215 }
1216 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001217 case '@':
1218 switch (c2) {
1219 case '=': return ATEQUAL;
1220 }
1221 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 }
1223 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001224}
1225
Thomas Wouters434d0822000-08-24 20:11:32 +00001226int
1227PyToken_ThreeChars(int c1, int c2, int c3)
1228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 switch (c1) {
1230 case '<':
1231 switch (c2) {
1232 case '<':
1233 switch (c3) {
1234 case '=':
1235 return LEFTSHIFTEQUAL;
1236 }
1237 break;
1238 }
1239 break;
1240 case '>':
1241 switch (c2) {
1242 case '>':
1243 switch (c3) {
1244 case '=':
1245 return RIGHTSHIFTEQUAL;
1246 }
1247 break;
1248 }
1249 break;
1250 case '*':
1251 switch (c2) {
1252 case '*':
1253 switch (c3) {
1254 case '=':
1255 return DOUBLESTAREQUAL;
1256 }
1257 break;
1258 }
1259 break;
1260 case '/':
1261 switch (c2) {
1262 case '/':
1263 switch (c3) {
1264 case '=':
1265 return DOUBLESLASHEQUAL;
1266 }
1267 break;
1268 }
1269 break;
1270 case '.':
1271 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001272 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 switch (c3) {
1274 case '.':
1275 return ELLIPSIS;
1276 }
1277 break;
1278 }
1279 break;
1280 }
1281 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001282}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001283
Guido van Rossum926f13a1998-04-09 21:38:06 +00001284static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001285indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001286{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 if (tok->alterror) {
1288 tok->done = E_TABSPACE;
1289 tok->cur = tok->inp;
1290 return 1;
1291 }
1292 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001293#ifdef PGEN
1294 PySys_WriteStderr("inconsistent use of tabs and spaces "
1295 "in indentation\n");
1296#else
1297 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001299#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 tok->altwarning = 0;
1301 }
1302 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001303}
1304
Martin v. Löwis47383402007-08-15 07:32:56 +00001305#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001306#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001307#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308/* Verify that the identifier follows PEP 3131.
1309 All identifier strings are guaranteed to be "ready" unicode objects.
1310 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001311static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001312verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001313{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 PyObject *s;
1315 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001316 if (tok->decoding_erred)
1317 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1321 PyErr_Clear();
1322 tok->done = E_IDENTIFIER;
1323 } else {
1324 tok->done = E_ERROR;
1325 }
1326 return 0;
1327 }
1328 result = PyUnicode_IsIdentifier(s);
1329 Py_DECREF(s);
1330 if (result == 0)
1331 tok->done = E_IDENTIFIER;
1332 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001333}
1334#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001335
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336/* Get next token, after space stripping etc. */
1337
/* tok_get: scan the input and return the type of the next token.

   tok      -- tokenizer state; advanced past the token that is returned.
   p_start  -- receives a pointer to the first byte of the token text.
   p_end    -- receives a pointer one past the last byte of the token text.

   Both pointers are reset to NULL on entry and are left NULL for INDENT,
   DEDENT, ENDMARKER and ERRORTOKEN returns.  When ERRORTOKEN is returned
   the specific error code is in tok->done.

   Processing order: indentation at the beginning of a line, pending
   INDENT/DEDENT tokens, whitespace and comments, then identifiers (with
   string prefixes and the async/await lookahead), NEWLINE, dots, numbers,
   strings, line continuations and finally operators. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001338static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001339tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001340{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001341 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001343
    /* Token text pointers stay NULL unless a branch below sets them. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001345 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 tok->start = NULL;
1347 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001348
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 /* Get indentation level */
1350 if (tok->atbol) {
        /* col is measured with tok->tabsize and altcol with
           tok->alttabsize; each is compared against its own stack below,
           and a disagreement between the two is what triggers
           indenterror(). */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001351 int col = 0;
1352 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001353 tok->atbol = 0;
1354 for (;;) {
1355 c = tok_nextc(tok);
1356 if (c == ' ')
1357 col++, altcol++;
1358 else if (c == '\t') {
1359 col = (col/tok->tabsize + 1) * tok->tabsize;
1360 altcol = (altcol/tok->alttabsize + 1)
1361 * tok->alttabsize;
1362 }
1363 else if (c == '\014') /* Control-L (formfeed) */
1364 col = altcol = 0; /* For Emacs users */
1365 else
1366 break;
1367 }
        /* Un-read the first character that is not indentation. */
1368 tok_backup(tok, c);
1369 if (c == '#' || c == '\n') {
1370 /* Lines with only whitespace and/or comments
1371 shouldn't affect the indentation and are
1372 not passed to the parser as NEWLINE tokens,
1373 except *totally* empty lines in interactive
1374 mode, which signal the end of a command group. */
1375 if (col == 0 && c == '\n' && tok->prompt != NULL)
1376 blankline = 0; /* Let it through */
1377 else
1378 blankline = 1; /* Ignore completely */
1379 /* We can't jump back right here since we still
1380 may need to skip to the end of a comment */
1381 }
        /* Indentation only matters on non-blank lines outside brackets. */
1382 if (!blankline && tok->level == 0) {
1383 if (col == tok->indstack[tok->indent]) {
1384 /* No change */
1385 if (altcol != tok->altindstack[tok->indent]) {
1386 if (indenterror(tok))
1387 return ERRORTOKEN;
1388 }
1389 }
1390 else if (col > tok->indstack[tok->indent]) {
1391 /* Indent -- always one */
1392 if (tok->indent+1 >= MAXINDENT) {
1393 tok->done = E_TOODEEP;
1394 tok->cur = tok->inp;
1395 return ERRORTOKEN;
1396 }
1397 if (altcol <= tok->altindstack[tok->indent]) {
1398 if (indenterror(tok))
1399 return ERRORTOKEN;
1400 }
1401 tok->pendin++;
1402 tok->indstack[++tok->indent] = col;
1403 tok->altindstack[tok->indent] = altcol;
1404 }
1405 else /* col < tok->indstack[tok->indent] */ {
1406 /* Dedent -- any number, must be consistent */
1407 while (tok->indent > 0 &&
1408 col < tok->indstack[tok->indent]) {
1409 tok->pendin--;
1410 tok->indent--;
1411 }
1412 if (col != tok->indstack[tok->indent]) {
1413 tok->done = E_DEDENT;
1414 tok->cur = tok->inp;
1415 return ERRORTOKEN;
1416 }
1417 if (altcol != tok->altindstack[tok->indent]) {
1418 if (indenterror(tok))
1419 return ERRORTOKEN;
1420 }
1421 }
1422 }
1423 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001427 /* Return pending indents/dedents */
    /* pendin > 0 means INDENT tokens are owed, pendin < 0 means DEDENT
       tokens are owed; exactly one is emitted per call. */
1428 if (tok->pendin != 0) {
1429 if (tok->pendin < 0) {
1430 tok->pendin++;
1431 return DEDENT;
1432 }
1433 else {
1434 tok->pendin--;
1435 return INDENT;
1436 }
1437 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001438
    /* Leave "inside async def" mode once a NEWLINE has been seen after the
       signature and the indentation is back at or below the level recorded
       in async_def_indent. */
Yury Selivanov96ec9342015-07-23 15:01:58 +03001439 if (tok->async_def
1440 && !blankline
1441 && tok->level == 0
1442 /* There was a NEWLINE after ASYNC DEF,
1443 so we're past the signature. */
1444 && tok->async_def_nl
1445 /* Current indentation level is less than where
1446 the async function was defined */
1447 && tok->async_def_indent >= tok->indent)
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001448 {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001449 tok->async_def = 0;
1450 tok->async_def_indent = 0;
1451 tok->async_def_nl = 0;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001452 }
1453
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001454 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001455 tok->start = NULL;
1456 /* Skip spaces */
1457 do {
1458 c = tok_nextc(tok);
1459 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 /* Set start of current token */
1462 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 /* Skip comment */
1465 if (c == '#')
1466 while (c != EOF && c != '\n')
1467 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001468
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 /* Check for EOF and errors now */
1470 if (c == EOF) {
1471 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001474 /* Identifier (most frequent token!) */
1475 nonascii = 0;
1476 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001477 /* Process b"", r"", u"", br"" and rb"" */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001478 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        /* Each prefix letter is accepted at most once and only in the
           combinations permitted by the conditions below.  A quote right
           after an accepted prefix jumps to string scanning with
           tok->start still pointing at the prefix. */
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001479 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001480 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001481 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001482 /* Since this is a backwards compatibility support literal we don't
1483 want to support it in arbitrary order like byte literals. */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001484 else if (!(saw_b || saw_u || saw_r || saw_f) && (c == 'u' || c == 'U'))
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001485 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001486 /* ur"" and ru"" are not supported */
1487 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001488 saw_r = 1;
Eric V. Smith235a6f02015-09-19 14:51:32 -04001489 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F'))
1490 saw_f = 1;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001491 else
1492 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 c = tok_nextc(tok);
1494 if (c == '"' || c == '\'')
1495 goto letter_quote;
1496 }
        /* No quote followed: scan the rest of the identifier. */
1497 while (is_potential_identifier_char(c)) {
1498 if (c >= 128)
1499 nonascii = 1;
1500 c = tok_nextc(tok);
1501 }
1502 tok_backup(tok, c);
        /* verify_identifier() is consulted only if a byte >= 128 was seen. */
Benjamin Petersond73aca72015-04-21 12:05:19 -04001503 if (nonascii && !verify_identifier(tok))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 return ERRORTOKEN;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001505 *p_start = tok->start;
1506 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001507
Yury Selivanov96ec9342015-07-23 15:01:58 +03001508 /* async/await parsing block. */
1509 if (tok->cur - tok->start == 5) {
1510 /* Current token length is 5. */
1511 if (tok->async_def) {
1512 /* We're inside an 'async def' function. */
1513 if (memcmp(tok->start, "async", 5) == 0)
1514 return ASYNC;
1515 if (memcmp(tok->start, "await", 5) == 0)
1516 return AWAIT;
Yury Selivanov75445082015-05-11 22:57:16 -04001517 }
Yury Selivanov96ec9342015-07-23 15:01:58 +03001518 else if (memcmp(tok->start, "async", 5) == 0) {
1519 /* The current token is 'async'.
1520 Look ahead one token.*/
Yury Selivanov8085b802015-05-18 12:50:52 -04001521
Yury Selivanov96ec9342015-07-23 15:01:58 +03001522 struct tok_state ahead_tok;
1523 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1524 int ahead_tok_kind;
Yury Selivanov8085b802015-05-18 12:50:52 -04001525
                /* The lookahead runs on a by-value copy so that tok's own
                   cursor and flags are not advanced.
                   NOTE(review): the copy shares tok's buffer pointers;
                   whether the lookahead can invalidate them by reading
                   more input is not handled here -- confirm. */
Yury Selivanov75445082015-05-11 22:57:16 -04001526 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
Yury Selivanov75445082015-05-11 22:57:16 -04001527 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
Yury Selivanov96ec9342015-07-23 15:01:58 +03001528 &ahead_tok_end);
Yury Selivanov75445082015-05-11 22:57:16 -04001529
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001530 if (ahead_tok_kind == NAME
1531 && ahead_tok.cur - ahead_tok.start == 3
1532 && memcmp(ahead_tok.start, "def", 3) == 0)
1533 {
1534 /* The next token is going to be 'def', so instead of
1535 returning 'async' NAME token, we return ASYNC. */
Yury Selivanov96ec9342015-07-23 15:01:58 +03001536 tok->async_def_indent = tok->indent;
1537 tok->async_def = 1;
Yury Selivanov75445082015-05-11 22:57:16 -04001538 return ASYNC;
1539 }
Yury Selivanov75445082015-05-11 22:57:16 -04001540 }
1541 }
1542
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 return NAME;
1544 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546 /* Newline */
1547 if (c == '\n') {
1548 tok->atbol = 1;
        /* Blank lines and newlines inside brackets are not reported as
           NEWLINE; scanning restarts on the next line instead. */
1549 if (blankline || tok->level > 0)
1550 goto nextline;
1551 *p_start = tok->start;
1552 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1553 tok->cont_line = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +03001554 if (tok->async_def) {
1555 /* We're somewhere inside an 'async def' function, and
1556 we've encountered a NEWLINE after its signature. */
1557 tok->async_def_nl = 1;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001558 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 return NEWLINE;
1560 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001561
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001562 /* Period or number starting with period? */
1563 if (c == '.') {
1564 c = tok_nextc(tok);
1565 if (isdigit(c)) {
1566 goto fraction;
1567 } else if (c == '.') {
1568 c = tok_nextc(tok);
1569 if (c == '.') {
1570 *p_start = tok->start;
1571 *p_end = tok->cur;
1572 return ELLIPSIS;
1573 } else {
1574 tok_backup(tok, c);
1575 }
            /* Only two dots: un-read the second one and report one DOT. */
1576 tok_backup(tok, '.');
1577 } else {
1578 tok_backup(tok, c);
1579 }
1580 *p_start = tok->start;
1581 *p_end = tok->cur;
1582 return DOT;
1583 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001584
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001585 /* Number */
    /* The fraction, exponent and imaginary labels inside the decimal branch
       are also jump targets for the leading-zero and leading-dot paths. */
1586 if (isdigit(c)) {
1587 if (c == '0') {
1588 /* Hex, octal or binary -- maybe. */
1589 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001591
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 /* Hex */
1593 c = tok_nextc(tok);
1594 if (!isxdigit(c)) {
1595 tok->done = E_TOKEN;
1596 tok_backup(tok, c);
1597 return ERRORTOKEN;
1598 }
1599 do {
1600 c = tok_nextc(tok);
1601 } while (isxdigit(c));
1602 }
1603 else if (c == 'o' || c == 'O') {
1604 /* Octal */
1605 c = tok_nextc(tok);
1606 if (c < '0' || c >= '8') {
1607 tok->done = E_TOKEN;
1608 tok_backup(tok, c);
1609 return ERRORTOKEN;
1610 }
1611 do {
1612 c = tok_nextc(tok);
1613 } while ('0' <= c && c < '8');
1614 }
1615 else if (c == 'b' || c == 'B') {
1616 /* Binary */
1617 c = tok_nextc(tok);
1618 if (c != '0' && c != '1') {
1619 tok->done = E_TOKEN;
1620 tok_backup(tok, c);
1621 return ERRORTOKEN;
1622 }
1623 do {
1624 c = tok_nextc(tok);
1625 } while (c == '0' || c == '1');
1626 }
1627 else {
1628 int nonzero = 0;
1629 /* maybe old-style octal; c is first char of it */
1630 /* in any case, allow '0' as a literal */
1631 while (c == '0')
1632 c = tok_nextc(tok);
1633 while (isdigit(c)) {
1634 nonzero = 1;
1635 c = tok_nextc(tok);
1636 }
1637 if (c == '.')
1638 goto fraction;
1639 else if (c == 'e' || c == 'E')
1640 goto exponent;
1641 else if (c == 'j' || c == 'J')
1642 goto imaginary;
                /* Non-zero digits after a leading 0 are an error unless
                   one of the float/imaginary continuations above applied. */
1643 else if (nonzero) {
1644 tok->done = E_TOKEN;
1645 tok_backup(tok, c);
1646 return ERRORTOKEN;
1647 }
1648 }
1649 }
1650 else {
1651 /* Decimal */
1652 do {
1653 c = tok_nextc(tok);
1654 } while (isdigit(c));
1655 {
1656 /* Accept floating point numbers. */
1657 if (c == '.') {
1658 fraction:
1659 /* Fraction */
1660 do {
1661 c = tok_nextc(tok);
1662 } while (isdigit(c));
1663 }
1664 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001665 int e;
1666 exponent:
1667 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 /* Exponent part */
1669 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001670 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001672 if (!isdigit(c)) {
1673 tok->done = E_TOKEN;
1674 tok_backup(tok, c);
1675 return ERRORTOKEN;
1676 }
1677 } else if (!isdigit(c)) {
                    /* 'e'/'E' not followed by a digit: un-read both
                       characters and end the number before the 'e'. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001679 tok_backup(tok, e);
1680 *p_start = tok->start;
1681 *p_end = tok->cur;
1682 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 }
1684 do {
1685 c = tok_nextc(tok);
1686 } while (isdigit(c));
1687 }
1688 if (c == 'j' || c == 'J')
1689 /* Imaginary part */
1690 imaginary:
1691 c = tok_nextc(tok);
1692 }
1693 }
1694 tok_backup(tok, c);
1695 *p_start = tok->start;
1696 *p_end = tok->cur;
1697 return NUMBER;
1698 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001699
    /* Reached either by falling through with c as the current character or
       by goto from the string-prefix loop above. */
1700 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 /* String */
1702 if (c == '\'' || c == '"') {
1703 int quote = c;
1704 int quote_size = 1; /* 1 or 3 */
1705 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001706
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 /* Find the quote size and start of string */
1708 c = tok_nextc(tok);
1709 if (c == quote) {
1710 c = tok_nextc(tok);
1711 if (c == quote)
1712 quote_size = 3;
1713 else
1714 end_quote_size = 1; /* empty string found */
1715 }
1716 if (c != quote)
1717 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001718
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 /* Get rest of string */
        /* end_quote_size counts the consecutive closing quote characters
           seen so far; the string ends when it reaches quote_size. */
1720 while (end_quote_size != quote_size) {
1721 c = tok_nextc(tok);
1722 if (c == EOF) {
1723 if (quote_size == 3)
1724 tok->done = E_EOFS;
1725 else
1726 tok->done = E_EOLS;
1727 tok->cur = tok->inp;
1728 return ERRORTOKEN;
1729 }
1730 if (quote_size == 1 && c == '\n') {
1731 tok->done = E_EOLS;
1732 tok->cur = tok->inp;
1733 return ERRORTOKEN;
1734 }
1735 if (c == quote)
1736 end_quote_size += 1;
1737 else {
1738 end_quote_size = 0;
1739 if (c == '\\')
Eric V. Smith6408dc82015-09-12 18:53:36 -04001740 c = tok_nextc(tok); /* skip escaped char */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001741 }
1742 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001743
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 *p_start = tok->start;
1745 *p_end = tok->cur;
1746 return STRING;
1747 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001748
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 /* Line continuation */
1750 if (c == '\\') {
1751 c = tok_nextc(tok);
1752 if (c != '\n') {
1753 tok->done = E_LINECONT;
1754 tok->cur = tok->inp;
1755 return ERRORTOKEN;
1756 }
1757 tok->cont_line = 1;
1758 goto again; /* Read next line */
1759 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001760
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001761 /* Check for two-character token */
1762 {
1763 int c2 = tok_nextc(tok);
1764 int token = PyToken_TwoChars(c, c2);
1765 if (token != OP) {
            /* A two-character operator matched; try to extend it to a
               three-character one, otherwise un-read the third char. */
1766 int c3 = tok_nextc(tok);
1767 int token3 = PyToken_ThreeChars(c, c2, c3);
1768 if (token3 != OP) {
1769 token = token3;
1770 } else {
1771 tok_backup(tok, c3);
1772 }
1773 *p_start = tok->start;
1774 *p_end = tok->cur;
1775 return token;
1776 }
1777 tok_backup(tok, c2);
1778 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001779
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001780 /* Keep track of parentheses nesting level */
    /* tok->level gates the NEWLINE and indentation handling above. */
1781 switch (c) {
1782 case '(':
1783 case '[':
1784 case '{':
1785 tok->level++;
1786 break;
1787 case ')':
1788 case ']':
1789 case '}':
1790 tok->level--;
1791 break;
1792 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001793
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001794 /* Punctuation character */
1795 *p_start = tok->start;
1796 *p_end = tok->cur;
1797 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001798}
1799
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001800int
1801PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1802{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001803 int result = tok_get(tok, p_start, p_end);
1804 if (tok->decoding_erred) {
1805 result = ERRORTOKEN;
1806 tok->done = E_DECODE;
1807 }
1808 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001809}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001810
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001811/* Get the encoding of a Python file. Check for the coding cookie and check if
1812 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001813
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001814 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1815 encoding in the first or second line of the file (in which case the encoding
1816 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001817
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001818 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1819 by the caller. */
1820
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001821char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001822PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001823{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001824 struct tok_state *tok;
1825 FILE *fp;
1826 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001827
Victor Stinnerdaf45552013-08-28 00:53:59 +02001828#ifndef PGEN
1829 fd = _Py_dup(fd);
1830#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001831 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001832#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001833 if (fd < 0) {
1834 return NULL;
1835 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001836
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001837 fp = fdopen(fd, "r");
1838 if (fp == NULL) {
1839 return NULL;
1840 }
1841 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1842 if (tok == NULL) {
1843 fclose(fp);
1844 return NULL;
1845 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001846#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001847 if (filename != NULL) {
1848 Py_INCREF(filename);
1849 tok->filename = filename;
1850 }
1851 else {
1852 tok->filename = PyUnicode_FromString("<string>");
1853 if (tok->filename == NULL) {
1854 fclose(fp);
1855 PyTokenizer_Free(tok);
1856 return encoding;
1857 }
1858 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001859#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001860 while (tok->lineno < 2 && tok->done == E_OK) {
1861 PyTokenizer_Get(tok, &p_start, &p_end);
1862 }
1863 fclose(fp);
1864 if (tok->encoding) {
1865 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1866 if (encoding)
1867 strcpy(encoding, tok->encoding);
1868 }
1869 PyTokenizer_Free(tok);
1870 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001871}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001872
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001873char *
1874PyTokenizer_FindEncoding(int fd)
1875{
1876 return PyTokenizer_FindEncodingFilename(fd, NULL);
1877}
1878
Guido van Rossum408027e1996-12-30 16:17:54 +00001879#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001880
1881void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001882tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001883{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001884 printf("%s", _PyParser_TokenNames[type]);
1885 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1886 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001887}
1888
1889#endif