blob: be7cf497c441efaf77fa3322ffb4d009c664e3e6 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   Every use of the argument is parenthesized so that an expression
   argument (e.g. a conditional expression) is tested as a whole.
   C is still evaluated several times: pass an expression without side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.
   Every use of the argument is parenthesized so that an expression
   argument (e.g. a conditional expression) is tested as a whole.
   C is still evaluated several times: pass an expression without side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, const char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers used before their definitions */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.  The
   number in front of each row is the index of that row's first entry. */
const char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "ATEQUAL", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    /* 53 */ "OP", "AWAIT", "ASYNC",
    /* 56 */ "<ERRORTOKEN>", "<N_TOKENS>"
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000116tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119 sizeof(struct tok_state));
120 if (tok == NULL)
121 return NULL;
122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123 tok->done = E_OK;
124 tok->fp = NULL;
125 tok->input = NULL;
126 tok->tabsize = TABSIZE;
127 tok->indent = 0;
128 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400129
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
134 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 tok->altwarning = 1;
136 tok->alterror = 1;
137 tok->alttabsize = 1;
138 tok->altindstack[0] = 0;
139 tok->decoding_state = STATE_INIT;
140 tok->decoding_erred = 0;
141 tok->read_coding_spec = 0;
142 tok->enc = NULL;
143 tok->encoding = NULL;
144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200146 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 tok->decoding_readline = NULL;
148 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000149#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300150
151 tok->async_def = 0;
152 tok->async_def_indent = 0;
153 tok->async_def_nl = 0;
154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156}
157
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 if (!result) {
163 tok->done = E_NOMEM;
164 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 memcpy(result, s, len);
167 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000169}
170
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000171#ifdef PGEN
172
173static char *
174decoding_fgets(char *s, int size, struct tok_state *tok)
175{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177}
178
179static int
180decoding_feof(struct tok_state *tok)
181{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183}
184
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000185static char *
186decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000187{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700188 return new_string(str, strlen(str), tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189}
190
191#else /* PGEN */
192
193static char *
194error_ret(struct tok_state *tok) /* XXX */
195{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 tok->decoding_erred = 1;
197 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200199 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000204
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   The first 12 bytes of S are compared after folding ASCII upper case to
   lower case and '_' to '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S is (or starts with, followed by '-') one of the known aliases;
   otherwise returns S itself, so callers can test "result != s" to learn
   whether a substitution happened.

   Case folding is done by hand on the ASCII range only: tolower() is
   locale-dependent and has undefined behaviour when handed a negative
   plain-char value. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        char c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233
234/* Return the coding spec in S, or NULL if none is found. */
235
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700236static int
237get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000238{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700240 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 /* Coding spec must be in a comment, and that comment must be
242 * the only statement on the source code line. */
243 for (i = 0; i < size - 6; i++) {
244 if (s[i] == '#')
245 break;
246 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 }
249 for (; i < size - 6; i++) { /* XXX inefficient search */
250 const char* t = s + i;
251 if (strncmp(t, "coding", 6) == 0) {
252 const char* begin = NULL;
253 t += 6;
254 if (t[0] != ':' && t[0] != '=')
255 continue;
256 do {
257 t++;
258 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 begin = t;
261 while (Py_ISALNUM(t[0]) ||
262 t[0] == '-' || t[0] == '_' || t[0] == '.')
263 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200267 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700268 if (!r)
269 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700270 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 if (r != q) {
272 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700273 r = new_string(q, strlen(q), tok);
274 if (!r)
275 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700277 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 }
279 }
280 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700281 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282}
283
284/* Check whether the line contains a coding spec. If it does,
285 invoke the set_readline function for the new encoding.
286 This function receives the tok_state and the new encoding.
287 Return 1 on success, 0 on failure. */
288
289static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000290check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000291 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700293 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000294 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000295
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200296 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000297 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200298 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000299 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200300 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700301 if (!get_coding_spec(line, &cs, size, tok))
302 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200303 if (!cs) {
304 Py_ssize_t i;
305 for (i = 0; i < size; i++) {
306 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
307 break;
308 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
309 /* Stop checking coding spec after a line containing
310 * anything except a comment. */
311 tok->read_coding_spec = 1;
312 break;
313 }
314 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700315 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200316 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700317 tok->read_coding_spec = 1;
318 if (tok->encoding == NULL) {
319 assert(tok->decoding_state == STATE_RAW);
320 if (strcmp(cs, "utf-8") == 0) {
321 tok->encoding = cs;
322 } else {
323 r = set_readline(tok, cs);
324 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700326 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000327 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700328 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300329 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700330 "encoding problem: %s", cs);
331 PyMem_FREE(cs);
332 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000333 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700334 } else { /* then, compare cs with BOM */
335 r = (strcmp(tok->encoding, cs) == 0);
336 if (!r)
337 PyErr_Format(PyExc_SyntaxError,
338 "encoding problem: %s with BOM", cs);
339 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000342}
343
344/* See whether the file starts with a BOM. If it does,
345 invoke the set_readline function with the new encoding.
346 Return 1 on success, 0 on failure. */
347
348static int
349check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 void unget_char(int, struct tok_state *),
351 int set_readline(struct tok_state *, const char *),
352 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000353{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000354 int ch1, ch2, ch3;
355 ch1 = get_char(tok);
356 tok->decoding_state = STATE_RAW;
357 if (ch1 == EOF) {
358 return 1;
359 } else if (ch1 == 0xEF) {
360 ch2 = get_char(tok);
361 if (ch2 != 0xBB) {
362 unget_char(ch2, tok);
363 unget_char(ch1, tok);
364 return 1;
365 }
366 ch3 = get_char(tok);
367 if (ch3 != 0xBF) {
368 unget_char(ch3, tok);
369 unget_char(ch2, tok);
370 unget_char(ch1, tok);
371 return 1;
372 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000374 /* Disable support for UTF-16 BOMs until a decision
375 is made whether this needs to be supported. */
376 } else if (ch1 == 0xFE) {
377 ch2 = get_char(tok);
378 if (ch2 != 0xFF) {
379 unget_char(ch2, tok);
380 unget_char(ch1, tok);
381 return 1;
382 }
383 if (!set_readline(tok, "utf-16-be"))
384 return 0;
385 tok->decoding_state = STATE_NORMAL;
386 } else if (ch1 == 0xFF) {
387 ch2 = get_char(tok);
388 if (ch2 != 0xFE) {
389 unget_char(ch2, tok);
390 unget_char(ch1, tok);
391 return 1;
392 }
393 if (!set_readline(tok, "utf-16-le"))
394 return 0;
395 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000396#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 } else {
398 unget_char(ch1, tok);
399 return 1;
400 }
401 if (tok->encoding != NULL)
402 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700403 tok->encoding = new_string("utf-8", 5, tok);
404 if (!tok->encoding)
405 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 /* No need to set_readline: input is already utf-8 */
407 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000408}
409
410/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000412
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000413 On entry, tok->decoding_buffer will be one of:
414 1) NULL: need to call tok->decoding_readline to get a new line
415 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000416 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000417 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 (in the s buffer) to copy entire contents of the line read
419 by tok->decoding_readline. tok->decoding_buffer has the overflow.
420 In this case, fp_readl is called in a loop (with an expanded buffer)
421 until the buffer ends with a '\n' (or until the end of the file is
422 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000423*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000424
425static char *
426fp_readl(char *s, int size, struct tok_state *tok)
427{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000428 PyObject* bufobj;
429 const char *buf;
430 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000431
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000432 /* Ask for one less byte so we can terminate it */
433 assert(size > 0);
434 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000435
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000436 if (tok->decoding_buffer) {
437 bufobj = tok->decoding_buffer;
438 Py_INCREF(bufobj);
439 }
440 else
441 {
442 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
443 if (bufobj == NULL)
444 goto error;
445 }
446 if (PyUnicode_CheckExact(bufobj))
447 {
448 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
449 if (buf == NULL) {
450 goto error;
451 }
452 }
453 else
454 {
455 buf = PyByteArray_AsString(bufobj);
456 if (buf == NULL) {
457 goto error;
458 }
459 buflen = PyByteArray_GET_SIZE(bufobj);
460 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000461
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 Py_XDECREF(tok->decoding_buffer);
463 if (buflen > size) {
464 /* Too many chars, the rest goes into tok->decoding_buffer */
465 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
466 buflen-size);
467 if (tok->decoding_buffer == NULL)
468 goto error;
469 buflen = size;
470 }
471 else
472 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000473
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474 memcpy(s, buf, buflen);
475 s[buflen] = '\0';
476 if (buflen == 0) /* EOF */
477 s = NULL;
478 Py_DECREF(bufobj);
479 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000480
481error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 Py_XDECREF(bufobj);
483 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000484}
485
486/* Set the readline function for TOK to a StreamReader's
487 readline function. The StreamReader is named ENC.
488
489 This function is called from check_bom and check_coding_spec.
490
491 ENC is usually identical to the future value of tok->encoding,
492 except for the (currently unsupported) case of UTF-16.
493
494 Return 1 on success, 0 on failure. */
495
496static int
497fp_setreadl(struct tok_state *tok, const char* enc)
498{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200500 _Py_IDENTIFIER(open);
501 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000502 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200503 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505 io = PyImport_ImportModuleNoBlock("io");
506 if (io == NULL)
507 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000508
Victor Stinner22a351a2010-10-14 12:04:34 +0000509 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200510 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100511 * position of tok->fp. If tok->fp was opened in text mode on Windows,
512 * its file position counts CRLF as one char and can't be directly mapped
513 * to the file offset for fd. Instead we step back one byte and read to
514 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200515 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100516 if (pos == -1 ||
517 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000518 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
519 goto cleanup;
520 }
521
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200522 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000523 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000524 if (stream == NULL)
525 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200527 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Serhiy Storchaka5a57ade2015-12-24 10:35:59 +0200528 Py_SETREF(tok->decoding_readline, readline);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100529 if (pos > 0) {
530 if (PyObject_CallObject(readline, NULL) == NULL) {
531 readline = NULL;
532 goto cleanup;
533 }
534 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000535
536 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 Py_XDECREF(stream);
538 Py_XDECREF(io);
539 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540}
541
542/* Fetch the next byte from TOK. */
543
544static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546}
547
548/* Unfetch the last byte back into TOK. */
549
550static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
/* Check whether the bytes at s start a well-formed UTF-8 sequence
   (lead byte followed by the right number of continuation bytes).
   Return the number of bytes forming the sequence if yes, 0 if not.

   s must be NUL-terminated.  Continuation bytes are examined front to
   back and the scan stops at the first byte that is not one, so a
   sequence cut short by the terminating NUL is rejected without reading
   anything beyond that NUL. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
581
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582/* Read a line of input from TOK. Determine encoding
583 if necessary. */
584
585static char *
586decoding_fgets(char *s, int size, struct tok_state *tok)
587{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 char *line = NULL;
589 int badchar = 0;
590 for (;;) {
591 if (tok->decoding_state == STATE_NORMAL) {
592 /* We already have a codec associated with
593 this input. */
594 line = fp_readl(s, size, tok);
595 break;
596 } else if (tok->decoding_state == STATE_RAW) {
597 /* We want a 'raw' read. */
598 line = Py_UniversalNewlineFgets(s, size,
599 tok->fp, NULL);
600 break;
601 } else {
602 /* We have not yet determined the encoding.
603 If an encoding is found, use the file-pointer
604 reader functions from now on. */
605 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
606 return error_ret(tok);
607 assert(tok->decoding_state != STATE_INIT);
608 }
609 }
610 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
611 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
612 return error_ret(tok);
613 }
614 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 /* The default encoding is UTF-8, so make sure we don't have any
617 non-UTF-8 sequences in it. */
618 if (line && !tok->encoding) {
619 unsigned char *c;
620 int length;
621 for (c = (unsigned char *)line; *c; c += length)
622 if (!(length = valid_utf8(c))) {
623 badchar = *c;
624 break;
625 }
626 }
627 if (badchar) {
628 /* Need to add 1 to the line number, since this line
629 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200630 PyErr_Format(PyExc_SyntaxError,
631 "Non-UTF-8 code starting with '\\x%.2x' "
632 "in file %U on line %i, "
633 "but no encoding declared; "
634 "see http://python.org/dev/peps/pep-0263/ for details",
635 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 return error_ret(tok);
637 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640}
641
642static int
643decoding_feof(struct tok_state *tok)
644{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 if (tok->decoding_state != STATE_NORMAL) {
646 return feof(tok->fp);
647 } else {
648 PyObject* buf = tok->decoding_buffer;
649 if (buf == NULL) {
650 buf = PyObject_CallObject(tok->decoding_readline, NULL);
651 if (buf == NULL) {
652 error_ret(tok);
653 return 1;
654 } else {
655 tok->decoding_buffer = buf;
656 }
657 }
658 return PyObject_Length(buf) == 0;
659 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660}
661
662/* Fetch a byte from TOK, using the string buffer. */
663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000664static int
665buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667}
668
669/* Unfetch a byte from TOK, using the string buffer. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static void
672buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->str--;
674 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Set the readline function for TOK to ENC. For the string-based
678 tokenizer, this means to just record the encoding. */
679
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000680static int
681buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 tok->enc = enc;
683 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684}
685
686/* Return a UTF-8 encoding Python string object from the
687 C byte string STR, which is encoded with ENC. */
688
689static PyObject *
690translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *utf8;
692 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
693 if (buf == NULL)
694 return NULL;
695 utf8 = PyUnicode_AsUTF8String(buf);
696 Py_DECREF(buf);
697 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698}
699
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000700
701static char *
702translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200703 int skip_next_lf = 0;
704 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 char *buf, *current;
706 char c = '\0';
707 buf = PyMem_MALLOC(needed_length);
708 if (buf == NULL) {
709 tok->done = E_NOMEM;
710 return NULL;
711 }
712 for (current = buf; *s; s++, current++) {
713 c = *s;
714 if (skip_next_lf) {
715 skip_next_lf = 0;
716 if (c == '\n') {
717 c = *++s;
718 if (!c)
719 break;
720 }
721 }
722 if (c == '\r') {
723 skip_next_lf = 1;
724 c = '\n';
725 }
726 *current = c;
727 }
728 /* If this is exec input, add a newline to the end of the string if
729 there isn't one already. */
730 if (exec_input && c != '\n') {
731 *current = '\n';
732 current++;
733 }
734 *current = '\0';
735 final_length = current - buf + 1;
736 if (final_length < needed_length && final_length)
737 /* should never fail */
738 buf = PyMem_REALLOC(buf, final_length);
739 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000740}
741
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000742/* Decode a byte string STR for use as the buffer of TOK.
743 Look for encoding declarations inside STR, and record them
744 inside TOK. */
745
746static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000747decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 PyObject* utf8 = NULL;
750 const char *str;
751 const char *s;
752 const char *newl[2] = {NULL, NULL};
753 int lineno = 0;
754 tok->input = str = translate_newlines(input, single, tok);
755 if (str == NULL)
756 return NULL;
757 tok->enc = NULL;
758 tok->str = str;
759 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
760 return error_ret(tok);
761 str = tok->str; /* string after BOM if any */
762 assert(str);
763 if (tok->enc != NULL) {
764 utf8 = translate_into_utf8(str, tok->enc);
765 if (utf8 == NULL)
766 return error_ret(tok);
767 str = PyBytes_AsString(utf8);
768 }
769 for (s = str;; s++) {
770 if (*s == '\0') break;
771 else if (*s == '\n') {
772 assert(lineno < 2);
773 newl[lineno] = s;
774 lineno++;
775 if (lineno == 2) break;
776 }
777 }
778 tok->enc = NULL;
779 /* need to check line 1 and 2 separately since check_coding_spec
780 assumes a single line as input */
781 if (newl[0]) {
782 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
783 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200784 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
786 tok, buf_setreadl))
787 return error_ret(tok);
788 }
789 }
790 if (tok->enc != NULL) {
791 assert(utf8 == NULL);
792 utf8 = translate_into_utf8(str, tok->enc);
793 if (utf8 == NULL)
794 return error_ret(tok);
795 str = PyBytes_AS_STRING(utf8);
796 }
797 assert(tok->decoding_buffer == NULL);
798 tok->decoding_buffer = utf8; /* CAUTION */
799 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000800}
801
802#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803
804/* Set up tokenizer for string */
805
806struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000807PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 struct tok_state *tok = tok_new();
810 if (tok == NULL)
811 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300812 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 if (str == NULL) {
814 PyTokenizer_Free(tok);
815 return NULL;
816 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000817
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 /* XXX: constify members. */
819 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
820 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821}
822
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000823struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000824PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000825{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 struct tok_state *tok = tok_new();
827 if (tok == NULL)
828 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000829#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000831#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 if (str == NULL) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 tok->decoding_state = STATE_RAW;
837 tok->read_coding_spec = 1;
838 tok->enc = NULL;
839 tok->str = str;
840 tok->encoding = (char *)PyMem_MALLOC(6);
841 if (!tok->encoding) {
842 PyTokenizer_Free(tok);
843 return NULL;
844 }
845 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000846
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000847 /* XXX: constify members. */
848 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
849 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000850}
851
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000852/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853
854struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300855PyTokenizer_FromFile(FILE *fp, const char* enc,
856 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000858 struct tok_state *tok = tok_new();
859 if (tok == NULL)
860 return NULL;
861 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
862 PyTokenizer_Free(tok);
863 return NULL;
864 }
865 tok->cur = tok->inp = tok->buf;
866 tok->end = tok->buf + BUFSIZ;
867 tok->fp = fp;
868 tok->prompt = ps1;
869 tok->nextprompt = ps2;
870 if (enc != NULL) {
871 /* Must copy encoding declaration since it
872 gets copied into the parse tree. */
873 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
874 if (!tok->encoding) {
875 PyTokenizer_Free(tok);
876 return NULL;
877 }
878 strcpy(tok->encoding, enc);
879 tok->decoding_state = STATE_NORMAL;
880 }
881 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882}
883
884
885/* Free a tok_state structure */
886
887void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000888PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000889{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 if (tok->encoding != NULL)
891 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000892#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000893 Py_XDECREF(tok->decoding_readline);
894 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200895 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000896#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 if (tok->fp != NULL && tok->buf != NULL)
898 PyMem_FREE(tok->buf);
899 if (tok->input)
900 PyMem_FREE((char *)tok->input);
901 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000902}
903
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000904/* Get next char, updating state; error code goes into tok->done */
905
906static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200907tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000909 for (;;) {
910 if (tok->cur != tok->inp) {
911 return Py_CHARMASK(*tok->cur++); /* Fast path */
912 }
913 if (tok->done != E_OK)
914 return EOF;
915 if (tok->fp == NULL) {
916 char *end = strchr(tok->inp, '\n');
917 if (end != NULL)
918 end++;
919 else {
920 end = strchr(tok->inp, '\0');
921 if (end == tok->inp) {
922 tok->done = E_EOF;
923 return EOF;
924 }
925 }
926 if (tok->start == NULL)
927 tok->buf = tok->cur;
928 tok->line_start = tok->cur;
929 tok->lineno++;
930 tok->inp = end;
931 return Py_CHARMASK(*tok->cur++);
932 }
933 if (tok->prompt != NULL) {
934 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000935#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000936 if (newtok != NULL) {
937 char *translated = translate_newlines(newtok, 0, tok);
938 PyMem_FREE(newtok);
939 if (translated == NULL)
940 return EOF;
941 newtok = translated;
942 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000943 if (tok->encoding && newtok && *newtok) {
944 /* Recode to UTF-8 */
945 Py_ssize_t buflen;
946 const char* buf;
947 PyObject *u = translate_into_utf8(newtok, tok->encoding);
948 PyMem_FREE(newtok);
949 if (!u) {
950 tok->done = E_DECODE;
951 return EOF;
952 }
953 buflen = PyBytes_GET_SIZE(u);
954 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 newtok = PyMem_MALLOC(buflen+1);
956 strcpy(newtok, buf);
957 Py_DECREF(u);
958 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000959#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000960 if (tok->nextprompt != NULL)
961 tok->prompt = tok->nextprompt;
962 if (newtok == NULL)
963 tok->done = E_INTR;
964 else if (*newtok == '\0') {
965 PyMem_FREE(newtok);
966 tok->done = E_EOF;
967 }
968 else if (tok->start != NULL) {
969 size_t start = tok->start - tok->buf;
970 size_t oldlen = tok->cur - tok->buf;
971 size_t newlen = oldlen + strlen(newtok);
972 char *buf = tok->buf;
973 buf = (char *)PyMem_REALLOC(buf, newlen+1);
974 tok->lineno++;
975 if (buf == NULL) {
976 PyMem_FREE(tok->buf);
977 tok->buf = NULL;
978 PyMem_FREE(newtok);
979 tok->done = E_NOMEM;
980 return EOF;
981 }
982 tok->buf = buf;
983 tok->cur = tok->buf + oldlen;
984 tok->line_start = tok->cur;
985 strcpy(tok->buf + oldlen, newtok);
986 PyMem_FREE(newtok);
987 tok->inp = tok->buf + newlen;
988 tok->end = tok->inp + 1;
989 tok->start = tok->buf + start;
990 }
991 else {
992 tok->lineno++;
993 if (tok->buf != NULL)
994 PyMem_FREE(tok->buf);
995 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000996 tok->cur = tok->buf;
997 tok->line_start = tok->buf;
998 tok->inp = strchr(tok->buf, '\0');
999 tok->end = tok->inp + 1;
1000 }
1001 }
1002 else {
1003 int done = 0;
1004 Py_ssize_t cur = 0;
1005 char *pt;
1006 if (tok->start == NULL) {
1007 if (tok->buf == NULL) {
1008 tok->buf = (char *)
1009 PyMem_MALLOC(BUFSIZ);
1010 if (tok->buf == NULL) {
1011 tok->done = E_NOMEM;
1012 return EOF;
1013 }
1014 tok->end = tok->buf + BUFSIZ;
1015 }
1016 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1017 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001018 if (!tok->decoding_erred)
1019 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001020 done = 1;
1021 }
1022 else {
1023 tok->done = E_OK;
1024 tok->inp = strchr(tok->buf, '\0');
1025 done = tok->inp[-1] == '\n';
1026 }
1027 }
1028 else {
1029 cur = tok->cur - tok->buf;
1030 if (decoding_feof(tok)) {
1031 tok->done = E_EOF;
1032 done = 1;
1033 }
1034 else
1035 tok->done = E_OK;
1036 }
1037 tok->lineno++;
1038 /* Read until '\n' or EOF */
1039 while (!done) {
1040 Py_ssize_t curstart = tok->start == NULL ? -1 :
1041 tok->start - tok->buf;
1042 Py_ssize_t curvalid = tok->inp - tok->buf;
1043 Py_ssize_t newsize = curvalid + BUFSIZ;
1044 char *newbuf = tok->buf;
1045 newbuf = (char *)PyMem_REALLOC(newbuf,
1046 newsize);
1047 if (newbuf == NULL) {
1048 tok->done = E_NOMEM;
1049 tok->cur = tok->inp;
1050 return EOF;
1051 }
1052 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001053 tok->cur = tok->buf + cur;
1054 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 tok->inp = tok->buf + curvalid;
1056 tok->end = tok->buf + newsize;
1057 tok->start = curstart < 0 ? NULL :
1058 tok->buf + curstart;
1059 if (decoding_fgets(tok->inp,
1060 (int)(tok->end - tok->inp),
1061 tok) == NULL) {
1062 /* Break out early on decoding
1063 errors, as tok->buf will be NULL
1064 */
1065 if (tok->decoding_erred)
1066 return EOF;
1067 /* Last line does not end in \n,
1068 fake one */
1069 strcpy(tok->inp, "\n");
1070 }
1071 tok->inp = strchr(tok->inp, '\0');
1072 done = tok->inp[-1] == '\n';
1073 }
1074 if (tok->buf != NULL) {
1075 tok->cur = tok->buf + cur;
1076 tok->line_start = tok->cur;
1077 /* replace "\r\n" with "\n" */
1078 /* For Mac leave the \r, giving a syntax error */
1079 pt = tok->inp - 2;
1080 if (pt >= tok->buf && *pt == '\r') {
1081 *pt++ = '\n';
1082 *pt = '\0';
1083 tok->inp = pt;
1084 }
1085 }
1086 }
1087 if (tok->done != E_OK) {
1088 if (tok->prompt != NULL)
1089 PySys_WriteStderr("\n");
1090 tok->cur = tok->inp;
1091 return EOF;
1092 }
1093 }
1094 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001095}
1096
1097
1098/* Back-up one character */
1099
1100static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001101tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001102{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 if (c != EOF) {
1104 if (--tok->cur < tok->buf)
1105 Py_FatalError("tok_backup: beginning of buffer");
1106 if (*tok->cur != c)
1107 *tok->cur = c;
1108 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109}
1110
1111
1112/* Return the token corresponding to a single character */
1113
1114int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001115PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 switch (c) {
1118 case '(': return LPAR;
1119 case ')': return RPAR;
1120 case '[': return LSQB;
1121 case ']': return RSQB;
1122 case ':': return COLON;
1123 case ',': return COMMA;
1124 case ';': return SEMI;
1125 case '+': return PLUS;
1126 case '-': return MINUS;
1127 case '*': return STAR;
1128 case '/': return SLASH;
1129 case '|': return VBAR;
1130 case '&': return AMPER;
1131 case '<': return LESS;
1132 case '>': return GREATER;
1133 case '=': return EQUAL;
1134 case '.': return DOT;
1135 case '%': return PERCENT;
1136 case '{': return LBRACE;
1137 case '}': return RBRACE;
1138 case '^': return CIRCUMFLEX;
1139 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001140 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 default: return OP;
1142 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001143}
1144
1145
Guido van Rossumfbab9051991-10-20 20:25:03 +00001146int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001147PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001148{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 switch (c1) {
1150 case '=':
1151 switch (c2) {
1152 case '=': return EQEQUAL;
1153 }
1154 break;
1155 case '!':
1156 switch (c2) {
1157 case '=': return NOTEQUAL;
1158 }
1159 break;
1160 case '<':
1161 switch (c2) {
1162 case '>': return NOTEQUAL;
1163 case '=': return LESSEQUAL;
1164 case '<': return LEFTSHIFT;
1165 }
1166 break;
1167 case '>':
1168 switch (c2) {
1169 case '=': return GREATEREQUAL;
1170 case '>': return RIGHTSHIFT;
1171 }
1172 break;
1173 case '+':
1174 switch (c2) {
1175 case '=': return PLUSEQUAL;
1176 }
1177 break;
1178 case '-':
1179 switch (c2) {
1180 case '=': return MINEQUAL;
1181 case '>': return RARROW;
1182 }
1183 break;
1184 case '*':
1185 switch (c2) {
1186 case '*': return DOUBLESTAR;
1187 case '=': return STAREQUAL;
1188 }
1189 break;
1190 case '/':
1191 switch (c2) {
1192 case '/': return DOUBLESLASH;
1193 case '=': return SLASHEQUAL;
1194 }
1195 break;
1196 case '|':
1197 switch (c2) {
1198 case '=': return VBAREQUAL;
1199 }
1200 break;
1201 case '%':
1202 switch (c2) {
1203 case '=': return PERCENTEQUAL;
1204 }
1205 break;
1206 case '&':
1207 switch (c2) {
1208 case '=': return AMPEREQUAL;
1209 }
1210 break;
1211 case '^':
1212 switch (c2) {
1213 case '=': return CIRCUMFLEXEQUAL;
1214 }
1215 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001216 case '@':
1217 switch (c2) {
1218 case '=': return ATEQUAL;
1219 }
1220 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001221 }
1222 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001223}
1224
Thomas Wouters434d0822000-08-24 20:11:32 +00001225int
1226PyToken_ThreeChars(int c1, int c2, int c3)
1227{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 switch (c1) {
1229 case '<':
1230 switch (c2) {
1231 case '<':
1232 switch (c3) {
1233 case '=':
1234 return LEFTSHIFTEQUAL;
1235 }
1236 break;
1237 }
1238 break;
1239 case '>':
1240 switch (c2) {
1241 case '>':
1242 switch (c3) {
1243 case '=':
1244 return RIGHTSHIFTEQUAL;
1245 }
1246 break;
1247 }
1248 break;
1249 case '*':
1250 switch (c2) {
1251 case '*':
1252 switch (c3) {
1253 case '=':
1254 return DOUBLESTAREQUAL;
1255 }
1256 break;
1257 }
1258 break;
1259 case '/':
1260 switch (c2) {
1261 case '/':
1262 switch (c3) {
1263 case '=':
1264 return DOUBLESLASHEQUAL;
1265 }
1266 break;
1267 }
1268 break;
1269 case '.':
1270 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001271 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 switch (c3) {
1273 case '.':
1274 return ELLIPSIS;
1275 }
1276 break;
1277 }
1278 break;
1279 }
1280 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001281}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001282
Guido van Rossum926f13a1998-04-09 21:38:06 +00001283static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001284indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001285{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 if (tok->alterror) {
1287 tok->done = E_TABSPACE;
1288 tok->cur = tok->inp;
1289 return 1;
1290 }
1291 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001292#ifdef PGEN
1293 PySys_WriteStderr("inconsistent use of tabs and spaces "
1294 "in indentation\n");
1295#else
1296 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001298#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 tok->altwarning = 0;
1300 }
1301 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001302}
1303
Martin v. Löwis47383402007-08-15 07:32:56 +00001304#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001305#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001306#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307/* Verify that the identifier follows PEP 3131.
1308 All identifier strings are guaranteed to be "ready" unicode objects.
1309 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001310static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001311verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001312{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 PyObject *s;
1314 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001315 if (tok->decoding_erred)
1316 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1320 PyErr_Clear();
1321 tok->done = E_IDENTIFIER;
1322 } else {
1323 tok->done = E_ERROR;
1324 }
1325 return 0;
1326 }
1327 result = PyUnicode_IsIdentifier(s);
1328 Py_DECREF(s);
1329 if (result == 0)
1330 tok->done = E_IDENTIFIER;
1331 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001332}
1333#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001334
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335/* Get next token, after space stripping etc. */
1336
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001337static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001338tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001340 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001342
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001344 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 tok->start = NULL;
1346 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 /* Get indentation level */
1349 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001350 int col = 0;
1351 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352 tok->atbol = 0;
1353 for (;;) {
1354 c = tok_nextc(tok);
1355 if (c == ' ')
1356 col++, altcol++;
1357 else if (c == '\t') {
1358 col = (col/tok->tabsize + 1) * tok->tabsize;
1359 altcol = (altcol/tok->alttabsize + 1)
1360 * tok->alttabsize;
1361 }
1362 else if (c == '\014') /* Control-L (formfeed) */
1363 col = altcol = 0; /* For Emacs users */
1364 else
1365 break;
1366 }
1367 tok_backup(tok, c);
1368 if (c == '#' || c == '\n') {
1369 /* Lines with only whitespace and/or comments
1370 shouldn't affect the indentation and are
1371 not passed to the parser as NEWLINE tokens,
1372 except *totally* empty lines in interactive
1373 mode, which signal the end of a command group. */
1374 if (col == 0 && c == '\n' && tok->prompt != NULL)
1375 blankline = 0; /* Let it through */
1376 else
1377 blankline = 1; /* Ignore completely */
1378 /* We can't jump back right here since we still
1379 may need to skip to the end of a comment */
1380 }
1381 if (!blankline && tok->level == 0) {
1382 if (col == tok->indstack[tok->indent]) {
1383 /* No change */
1384 if (altcol != tok->altindstack[tok->indent]) {
1385 if (indenterror(tok))
1386 return ERRORTOKEN;
1387 }
1388 }
1389 else if (col > tok->indstack[tok->indent]) {
1390 /* Indent -- always one */
1391 if (tok->indent+1 >= MAXINDENT) {
1392 tok->done = E_TOODEEP;
1393 tok->cur = tok->inp;
1394 return ERRORTOKEN;
1395 }
1396 if (altcol <= tok->altindstack[tok->indent]) {
1397 if (indenterror(tok))
1398 return ERRORTOKEN;
1399 }
1400 tok->pendin++;
1401 tok->indstack[++tok->indent] = col;
1402 tok->altindstack[tok->indent] = altcol;
1403 }
1404 else /* col < tok->indstack[tok->indent] */ {
1405 /* Dedent -- any number, must be consistent */
1406 while (tok->indent > 0 &&
1407 col < tok->indstack[tok->indent]) {
1408 tok->pendin--;
1409 tok->indent--;
1410 }
1411 if (col != tok->indstack[tok->indent]) {
1412 tok->done = E_DEDENT;
1413 tok->cur = tok->inp;
1414 return ERRORTOKEN;
1415 }
1416 if (altcol != tok->altindstack[tok->indent]) {
1417 if (indenterror(tok))
1418 return ERRORTOKEN;
1419 }
1420 }
1421 }
1422 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426 /* Return pending indents/dedents */
1427 if (tok->pendin != 0) {
1428 if (tok->pendin < 0) {
1429 tok->pendin++;
1430 return DEDENT;
1431 }
1432 else {
1433 tok->pendin--;
1434 return INDENT;
1435 }
1436 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001437
Yury Selivanov96ec9342015-07-23 15:01:58 +03001438 if (tok->async_def
1439 && !blankline
1440 && tok->level == 0
1441 /* There was a NEWLINE after ASYNC DEF,
1442 so we're past the signature. */
1443 && tok->async_def_nl
1444 /* Current indentation level is less than where
1445 the async function was defined */
1446 && tok->async_def_indent >= tok->indent)
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001447 {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001448 tok->async_def = 0;
1449 tok->async_def_indent = 0;
1450 tok->async_def_nl = 0;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001451 }
1452
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001453 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 tok->start = NULL;
1455 /* Skip spaces */
1456 do {
1457 c = tok_nextc(tok);
1458 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 /* Set start of current token */
1461 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001462
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001463 /* Skip comment */
1464 if (c == '#')
1465 while (c != EOF && c != '\n')
1466 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001467
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001468 /* Check for EOF and errors now */
1469 if (c == EOF) {
1470 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1471 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001472
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 /* Identifier (most frequent token!) */
1474 nonascii = 0;
1475 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001476 /* Process b"", r"", u"", br"" and rb"" */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001477 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001478 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001479 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001480 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001481 /* Since this is a backwards compatibility support literal we don't
1482 want to support it in arbitrary order like byte literals. */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001483 else if (!(saw_b || saw_u || saw_r || saw_f) && (c == 'u' || c == 'U'))
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001484 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001485 /* ur"" and ru"" are not supported */
1486 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001487 saw_r = 1;
Eric V. Smith235a6f02015-09-19 14:51:32 -04001488 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F'))
1489 saw_f = 1;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001490 else
1491 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001492 c = tok_nextc(tok);
1493 if (c == '"' || c == '\'')
1494 goto letter_quote;
1495 }
1496 while (is_potential_identifier_char(c)) {
1497 if (c >= 128)
1498 nonascii = 1;
1499 c = tok_nextc(tok);
1500 }
1501 tok_backup(tok, c);
Benjamin Petersond73aca72015-04-21 12:05:19 -04001502 if (nonascii && !verify_identifier(tok))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001503 return ERRORTOKEN;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 *p_start = tok->start;
1505 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001506
Yury Selivanov96ec9342015-07-23 15:01:58 +03001507 /* async/await parsing block. */
1508 if (tok->cur - tok->start == 5) {
1509 /* Current token length is 5. */
1510 if (tok->async_def) {
1511 /* We're inside an 'async def' function. */
1512 if (memcmp(tok->start, "async", 5) == 0)
1513 return ASYNC;
1514 if (memcmp(tok->start, "await", 5) == 0)
1515 return AWAIT;
Yury Selivanov75445082015-05-11 22:57:16 -04001516 }
Yury Selivanov96ec9342015-07-23 15:01:58 +03001517 else if (memcmp(tok->start, "async", 5) == 0) {
1518 /* The current token is 'async'.
1519 Look ahead one token.*/
Yury Selivanov8085b802015-05-18 12:50:52 -04001520
Yury Selivanov96ec9342015-07-23 15:01:58 +03001521 struct tok_state ahead_tok;
1522 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1523 int ahead_tok_kind;
Yury Selivanov8085b802015-05-18 12:50:52 -04001524
Yury Selivanov75445082015-05-11 22:57:16 -04001525 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
Yury Selivanov75445082015-05-11 22:57:16 -04001526 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
Yury Selivanov96ec9342015-07-23 15:01:58 +03001527 &ahead_tok_end);
Yury Selivanov75445082015-05-11 22:57:16 -04001528
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001529 if (ahead_tok_kind == NAME
1530 && ahead_tok.cur - ahead_tok.start == 3
1531 && memcmp(ahead_tok.start, "def", 3) == 0)
1532 {
1533 /* The next token is going to be 'def', so instead of
1534 returning 'async' NAME token, we return ASYNC. */
Yury Selivanov96ec9342015-07-23 15:01:58 +03001535 tok->async_def_indent = tok->indent;
1536 tok->async_def = 1;
Yury Selivanov75445082015-05-11 22:57:16 -04001537 return ASYNC;
1538 }
Yury Selivanov75445082015-05-11 22:57:16 -04001539 }
1540 }
1541
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 return NAME;
1543 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001544
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 /* Newline */
1546 if (c == '\n') {
1547 tok->atbol = 1;
1548 if (blankline || tok->level > 0)
1549 goto nextline;
1550 *p_start = tok->start;
1551 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1552 tok->cont_line = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +03001553 if (tok->async_def) {
1554 /* We're somewhere inside an 'async def' function, and
1555 we've encountered a NEWLINE after its signature. */
1556 tok->async_def_nl = 1;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001557 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 return NEWLINE;
1559 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001560
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 /* Period or number starting with period? */
1562 if (c == '.') {
1563 c = tok_nextc(tok);
1564 if (isdigit(c)) {
1565 goto fraction;
1566 } else if (c == '.') {
1567 c = tok_nextc(tok);
1568 if (c == '.') {
1569 *p_start = tok->start;
1570 *p_end = tok->cur;
1571 return ELLIPSIS;
1572 } else {
1573 tok_backup(tok, c);
1574 }
1575 tok_backup(tok, '.');
1576 } else {
1577 tok_backup(tok, c);
1578 }
1579 *p_start = tok->start;
1580 *p_end = tok->cur;
1581 return DOT;
1582 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001583
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 /* Number */
1585 if (isdigit(c)) {
1586 if (c == '0') {
1587 /* Hex, octal or binary -- maybe. */
1588 c = tok_nextc(tok);
1589 if (c == '.')
1590 goto fraction;
1591 if (c == 'j' || c == 'J')
1592 goto imaginary;
1593 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001594
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 /* Hex */
1596 c = tok_nextc(tok);
1597 if (!isxdigit(c)) {
1598 tok->done = E_TOKEN;
1599 tok_backup(tok, c);
1600 return ERRORTOKEN;
1601 }
1602 do {
1603 c = tok_nextc(tok);
1604 } while (isxdigit(c));
1605 }
1606 else if (c == 'o' || c == 'O') {
1607 /* Octal */
1608 c = tok_nextc(tok);
1609 if (c < '0' || c >= '8') {
1610 tok->done = E_TOKEN;
1611 tok_backup(tok, c);
1612 return ERRORTOKEN;
1613 }
1614 do {
1615 c = tok_nextc(tok);
1616 } while ('0' <= c && c < '8');
1617 }
1618 else if (c == 'b' || c == 'B') {
1619 /* Binary */
1620 c = tok_nextc(tok);
1621 if (c != '0' && c != '1') {
1622 tok->done = E_TOKEN;
1623 tok_backup(tok, c);
1624 return ERRORTOKEN;
1625 }
1626 do {
1627 c = tok_nextc(tok);
1628 } while (c == '0' || c == '1');
1629 }
1630 else {
1631 int nonzero = 0;
1632 /* maybe old-style octal; c is first char of it */
1633 /* in any case, allow '0' as a literal */
1634 while (c == '0')
1635 c = tok_nextc(tok);
1636 while (isdigit(c)) {
1637 nonzero = 1;
1638 c = tok_nextc(tok);
1639 }
1640 if (c == '.')
1641 goto fraction;
1642 else if (c == 'e' || c == 'E')
1643 goto exponent;
1644 else if (c == 'j' || c == 'J')
1645 goto imaginary;
1646 else if (nonzero) {
1647 tok->done = E_TOKEN;
1648 tok_backup(tok, c);
1649 return ERRORTOKEN;
1650 }
1651 }
1652 }
1653 else {
1654 /* Decimal */
1655 do {
1656 c = tok_nextc(tok);
1657 } while (isdigit(c));
1658 {
1659 /* Accept floating point numbers. */
1660 if (c == '.') {
1661 fraction:
1662 /* Fraction */
1663 do {
1664 c = tok_nextc(tok);
1665 } while (isdigit(c));
1666 }
1667 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001668 int e;
1669 exponent:
1670 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 /* Exponent part */
1672 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001673 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001675 if (!isdigit(c)) {
1676 tok->done = E_TOKEN;
1677 tok_backup(tok, c);
1678 return ERRORTOKEN;
1679 }
1680 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001682 tok_backup(tok, e);
1683 *p_start = tok->start;
1684 *p_end = tok->cur;
1685 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 }
1687 do {
1688 c = tok_nextc(tok);
1689 } while (isdigit(c));
1690 }
1691 if (c == 'j' || c == 'J')
1692 /* Imaginary part */
1693 imaginary:
1694 c = tok_nextc(tok);
1695 }
1696 }
1697 tok_backup(tok, c);
1698 *p_start = tok->start;
1699 *p_end = tok->cur;
1700 return NUMBER;
1701 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001702
1703 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 /* String */
1705 if (c == '\'' || c == '"') {
1706 int quote = c;
1707 int quote_size = 1; /* 1 or 3 */
1708 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001709
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 /* Find the quote size and start of string */
1711 c = tok_nextc(tok);
1712 if (c == quote) {
1713 c = tok_nextc(tok);
1714 if (c == quote)
1715 quote_size = 3;
1716 else
1717 end_quote_size = 1; /* empty string found */
1718 }
1719 if (c != quote)
1720 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001721
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 /* Get rest of string */
1723 while (end_quote_size != quote_size) {
1724 c = tok_nextc(tok);
1725 if (c == EOF) {
1726 if (quote_size == 3)
1727 tok->done = E_EOFS;
1728 else
1729 tok->done = E_EOLS;
1730 tok->cur = tok->inp;
1731 return ERRORTOKEN;
1732 }
1733 if (quote_size == 1 && c == '\n') {
1734 tok->done = E_EOLS;
1735 tok->cur = tok->inp;
1736 return ERRORTOKEN;
1737 }
1738 if (c == quote)
1739 end_quote_size += 1;
1740 else {
1741 end_quote_size = 0;
1742 if (c == '\\')
Eric V. Smith6408dc82015-09-12 18:53:36 -04001743 c = tok_nextc(tok); /* skip escaped char */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 }
1745 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001746
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 *p_start = tok->start;
1748 *p_end = tok->cur;
1749 return STRING;
1750 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001751
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 /* Line continuation */
1753 if (c == '\\') {
1754 c = tok_nextc(tok);
1755 if (c != '\n') {
1756 tok->done = E_LINECONT;
1757 tok->cur = tok->inp;
1758 return ERRORTOKEN;
1759 }
1760 tok->cont_line = 1;
1761 goto again; /* Read next line */
1762 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001763
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001764 /* Check for two-character token */
1765 {
1766 int c2 = tok_nextc(tok);
1767 int token = PyToken_TwoChars(c, c2);
1768 if (token != OP) {
1769 int c3 = tok_nextc(tok);
1770 int token3 = PyToken_ThreeChars(c, c2, c3);
1771 if (token3 != OP) {
1772 token = token3;
1773 } else {
1774 tok_backup(tok, c3);
1775 }
1776 *p_start = tok->start;
1777 *p_end = tok->cur;
1778 return token;
1779 }
1780 tok_backup(tok, c2);
1781 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001782
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 /* Keep track of parentheses nesting level */
1784 switch (c) {
1785 case '(':
1786 case '[':
1787 case '{':
1788 tok->level++;
1789 break;
1790 case ')':
1791 case ']':
1792 case '}':
1793 tok->level--;
1794 break;
1795 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001796
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001797 /* Punctuation character */
1798 *p_start = tok->start;
1799 *p_end = tok->cur;
1800 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001801}
1802
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001803int
1804PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1805{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 int result = tok_get(tok, p_start, p_end);
1807 if (tok->decoding_erred) {
1808 result = ERRORTOKEN;
1809 tok->done = E_DECODE;
1810 }
1811 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001812}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001813
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001814/* Get the encoding of a Python file. Check for the coding cookie and check if
1815 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001816
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001817 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1818 encoding in the first or second line of the file (in which case the encoding
1819 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001820
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001821 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1822 by the caller. */
1823
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001824char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001825PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001826{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001827 struct tok_state *tok;
1828 FILE *fp;
1829 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001830
Victor Stinnerdaf45552013-08-28 00:53:59 +02001831#ifndef PGEN
1832 fd = _Py_dup(fd);
1833#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001834 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001835#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001836 if (fd < 0) {
1837 return NULL;
1838 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001839
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001840 fp = fdopen(fd, "r");
1841 if (fp == NULL) {
1842 return NULL;
1843 }
1844 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1845 if (tok == NULL) {
1846 fclose(fp);
1847 return NULL;
1848 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001849#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001850 if (filename != NULL) {
1851 Py_INCREF(filename);
1852 tok->filename = filename;
1853 }
1854 else {
1855 tok->filename = PyUnicode_FromString("<string>");
1856 if (tok->filename == NULL) {
1857 fclose(fp);
1858 PyTokenizer_Free(tok);
1859 return encoding;
1860 }
1861 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001862#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001863 while (tok->lineno < 2 && tok->done == E_OK) {
1864 PyTokenizer_Get(tok, &p_start, &p_end);
1865 }
1866 fclose(fp);
1867 if (tok->encoding) {
1868 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1869 if (encoding)
1870 strcpy(encoding, tok->encoding);
1871 }
1872 PyTokenizer_Free(tok);
1873 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001874}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001875
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001876char *
1877PyTokenizer_FindEncoding(int fd)
1878{
1879 return PyTokenizer_FindEncodingFilename(fd, NULL);
1880}
1881
Guido van Rossum408027e1996-12-30 16:17:54 +00001882#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001883
1884void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001885tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001886{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001887 printf("%s", _PyParser_TokenNames[type]);
1888 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1889 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001890}
1891
1892#endif