blob: 04baeaf38adeb502e83efa49f701535075c56baf [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128.  The argument is fully parenthesized so that expressions such as
   `x ? a : b` expand correctly.  C is evaluated more than once, so it must
   not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, '_',
   or any value >= 128.  The argument is fully parenthesized so that
   expressions such as `x ? a : b` expand correctly.  C is evaluated more
   than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.
   The order of the entries in this table must match the #defines
   in token.h! */

const char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons and multi-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "ATEQUAL", "RARROW", "ELLIPSIS",

    "OP",
    "AWAIT", "ASYNC",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000116tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119 sizeof(struct tok_state));
120 if (tok == NULL)
121 return NULL;
122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123 tok->done = E_OK;
124 tok->fp = NULL;
125 tok->input = NULL;
126 tok->tabsize = TABSIZE;
127 tok->indent = 0;
128 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400129
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
134 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 tok->altwarning = 1;
136 tok->alterror = 1;
137 tok->alttabsize = 1;
138 tok->altindstack[0] = 0;
139 tok->decoding_state = STATE_INIT;
140 tok->decoding_erred = 0;
141 tok->read_coding_spec = 0;
142 tok->enc = NULL;
143 tok->encoding = NULL;
144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200146 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 tok->decoding_readline = NULL;
148 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000149#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300150
151 tok->async_def = 0;
152 tok->async_def_indent = 0;
153 tok->async_def_nl = 0;
154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156}
157
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 if (!result) {
163 tok->done = E_NOMEM;
164 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 memcpy(result, s, len);
167 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000169}
170
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000171#ifdef PGEN
172
173static char *
174decoding_fgets(char *s, int size, struct tok_state *tok)
175{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177}
178
179static int
180decoding_feof(struct tok_state *tok)
181{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183}
184
/* PGEN build: no decoding is done; hand back a fresh copy of STR made by
   new_string() (NULL if that fails).  EXEC_INPUT is not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
190
191#else /* PGEN */
192
193static char *
194error_ret(struct tok_state *tok) /* XXX */
195{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 tok->decoding_erred = 1;
197 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198 PyMem_FREE(tok->buf);
199 tok->buf = NULL;
200 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000201}
202
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000203
/* Map the usual spellings of UTF-8 and Latin-1 onto one canonical name.

   At most the first 12 characters of S are looked at; they are lower-cased
   and '_' is treated like '-'.  If the result is "utf-8", "latin-1",
   "iso-8859-1" or "iso-latin-1", optionally followed by "-<anything>", the
   string literal "utf-8" or "iso-8859-1" is returned.  Otherwise S itself
   is returned, so callers can detect "no change" by comparing pointers.
   When the result is not S it points to a string literal and must not be
   written to or freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a plain
           char holding a byte >= 0x80) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
232
233/* Return the coding spec in S, or NULL if none is found. */
234
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700235static int
236get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700239 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 /* Coding spec must be in a comment, and that comment must be
241 * the only statement on the source code line. */
242 for (i = 0; i < size - 6; i++) {
243 if (s[i] == '#')
244 break;
245 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700246 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 }
248 for (; i < size - 6; i++) { /* XXX inefficient search */
249 const char* t = s + i;
250 if (strncmp(t, "coding", 6) == 0) {
251 const char* begin = NULL;
252 t += 6;
253 if (t[0] != ':' && t[0] != '=')
254 continue;
255 do {
256 t++;
257 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 begin = t;
260 while (Py_ISALNUM(t[0]) ||
261 t[0] == '-' || t[0] == '_' || t[0] == '.')
262 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000263
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000264 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700265 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700266 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 if (!r)
268 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700269 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000270 if (r != q) {
271 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700272 r = new_string(q, strlen(q), tok);
273 if (!r)
274 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700276 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000277 }
278 }
279 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700280 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281}
282
283/* Check whether the line contains a coding spec. If it does,
284 invoke the set_readline function for the new encoding.
285 This function receives the tok_state and the new encoding.
286 Return 1 on success, 0 on failure. */
287
288static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000289check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000291{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700292 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000293 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000294
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200295 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000296 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200297 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200299 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700300 if (!get_coding_spec(line, &cs, size, tok))
301 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200302 if (!cs) {
303 Py_ssize_t i;
304 for (i = 0; i < size; i++) {
305 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
306 break;
307 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
308 /* Stop checking coding spec after a line containing
309 * anything except a comment. */
310 tok->read_coding_spec = 1;
311 break;
312 }
313 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700314 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200315 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 tok->read_coding_spec = 1;
317 if (tok->encoding == NULL) {
318 assert(tok->decoding_state == STATE_RAW);
319 if (strcmp(cs, "utf-8") == 0) {
320 tok->encoding = cs;
321 } else {
322 r = set_readline(tok, cs);
323 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700325 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700327 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300328 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700329 "encoding problem: %s", cs);
330 PyMem_FREE(cs);
331 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700333 } else { /* then, compare cs with BOM */
334 r = (strcmp(tok->encoding, cs) == 0);
335 if (!r)
336 PyErr_Format(PyExc_SyntaxError,
337 "encoding problem: %s with BOM", cs);
338 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000339 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341}
342
343/* See whether the file starts with a BOM. If it does,
344 invoke the set_readline function with the new encoding.
345 Return 1 on success, 0 on failure. */
346
347static int
348check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000349 void unget_char(int, struct tok_state *),
350 int set_readline(struct tok_state *, const char *),
351 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000353 int ch1, ch2, ch3;
354 ch1 = get_char(tok);
355 tok->decoding_state = STATE_RAW;
356 if (ch1 == EOF) {
357 return 1;
358 } else if (ch1 == 0xEF) {
359 ch2 = get_char(tok);
360 if (ch2 != 0xBB) {
361 unget_char(ch2, tok);
362 unget_char(ch1, tok);
363 return 1;
364 }
365 ch3 = get_char(tok);
366 if (ch3 != 0xBF) {
367 unget_char(ch3, tok);
368 unget_char(ch2, tok);
369 unget_char(ch1, tok);
370 return 1;
371 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 /* Disable support for UTF-16 BOMs until a decision
374 is made whether this needs to be supported. */
375 } else if (ch1 == 0xFE) {
376 ch2 = get_char(tok);
377 if (ch2 != 0xFF) {
378 unget_char(ch2, tok);
379 unget_char(ch1, tok);
380 return 1;
381 }
382 if (!set_readline(tok, "utf-16-be"))
383 return 0;
384 tok->decoding_state = STATE_NORMAL;
385 } else if (ch1 == 0xFF) {
386 ch2 = get_char(tok);
387 if (ch2 != 0xFE) {
388 unget_char(ch2, tok);
389 unget_char(ch1, tok);
390 return 1;
391 }
392 if (!set_readline(tok, "utf-16-le"))
393 return 0;
394 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000395#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000396 } else {
397 unget_char(ch1, tok);
398 return 1;
399 }
400 if (tok->encoding != NULL)
401 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700402 tok->encoding = new_string("utf-8", 5, tok);
403 if (!tok->encoding)
404 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 /* No need to set_readline: input is already utf-8 */
406 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000407}
408
409/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000410 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000412 On entry, tok->decoding_buffer will be one of:
413 1) NULL: need to call tok->decoding_readline to get a new line
414 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000415 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000416 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 (in the s buffer) to copy entire contents of the line read
418 by tok->decoding_readline. tok->decoding_buffer has the overflow.
419 In this case, fp_readl is called in a loop (with an expanded buffer)
420 until the buffer ends with a '\n' (or until the end of the file is
421 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000422*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000423
424static char *
425fp_readl(char *s, int size, struct tok_state *tok)
426{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 PyObject* bufobj;
428 const char *buf;
429 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000430
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000431 /* Ask for one less byte so we can terminate it */
432 assert(size > 0);
433 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000434
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000435 if (tok->decoding_buffer) {
436 bufobj = tok->decoding_buffer;
437 Py_INCREF(bufobj);
438 }
439 else
440 {
441 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
442 if (bufobj == NULL)
443 goto error;
444 }
445 if (PyUnicode_CheckExact(bufobj))
446 {
447 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
448 if (buf == NULL) {
449 goto error;
450 }
451 }
452 else
453 {
454 buf = PyByteArray_AsString(bufobj);
455 if (buf == NULL) {
456 goto error;
457 }
458 buflen = PyByteArray_GET_SIZE(bufobj);
459 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000460
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000461 Py_XDECREF(tok->decoding_buffer);
462 if (buflen > size) {
463 /* Too many chars, the rest goes into tok->decoding_buffer */
464 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
465 buflen-size);
466 if (tok->decoding_buffer == NULL)
467 goto error;
468 buflen = size;
469 }
470 else
471 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000472
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 memcpy(s, buf, buflen);
474 s[buflen] = '\0';
475 if (buflen == 0) /* EOF */
476 s = NULL;
477 Py_DECREF(bufobj);
478 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000479
480error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000481 Py_XDECREF(bufobj);
482 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000483}
484
485/* Set the readline function for TOK to a StreamReader's
486 readline function. The StreamReader is named ENC.
487
488 This function is called from check_bom and check_coding_spec.
489
490 ENC is usually identical to the future value of tok->encoding,
491 except for the (currently unsupported) case of UTF-16.
492
493 Return 1 on success, 0 on failure. */
494
495static int
496fp_setreadl(struct tok_state *tok, const char* enc)
497{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200499 _Py_IDENTIFIER(open);
500 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000501 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200502 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000504 io = PyImport_ImportModuleNoBlock("io");
505 if (io == NULL)
506 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000507
Victor Stinner22a351a2010-10-14 12:04:34 +0000508 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200509 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100510 * position of tok->fp. If tok->fp was opened in text mode on Windows,
511 * its file position counts CRLF as one char and can't be directly mapped
512 * to the file offset for fd. Instead we step back one byte and read to
513 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200514 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100515 if (pos == -1 ||
516 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000517 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
518 goto cleanup;
519 }
520
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200521 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000522 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000523 if (stream == NULL)
524 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000525
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000526 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200527 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000528 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100529 if (pos > 0) {
530 if (PyObject_CallObject(readline, NULL) == NULL) {
531 readline = NULL;
532 goto cleanup;
533 }
534 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000535
536 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537 Py_XDECREF(stream);
538 Py_XDECREF(io);
539 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540}
541
542/* Fetch the next byte from TOK. */
543
544static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546}
547
548/* Unfetch the last byte back into TOK. */
549
550static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   Only the shape of the sequence is checked: a lead byte announcing 1 to 4
   bytes, followed by the right number of 10xxxxxx continuation bytes.
   Overlong forms and values above U+10FFFF are not rejected here.

   S must be NUL-terminated.  The continuation bytes are inspected front to
   back and the scan stops at the first byte that is not a continuation
   byte (which a NUL is not), so nothing past the terminator is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
581
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582/* Read a line of input from TOK. Determine encoding
583 if necessary. */
584
585static char *
586decoding_fgets(char *s, int size, struct tok_state *tok)
587{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 char *line = NULL;
589 int badchar = 0;
590 for (;;) {
591 if (tok->decoding_state == STATE_NORMAL) {
592 /* We already have a codec associated with
593 this input. */
594 line = fp_readl(s, size, tok);
595 break;
596 } else if (tok->decoding_state == STATE_RAW) {
597 /* We want a 'raw' read. */
598 line = Py_UniversalNewlineFgets(s, size,
599 tok->fp, NULL);
600 break;
601 } else {
602 /* We have not yet determined the encoding.
603 If an encoding is found, use the file-pointer
604 reader functions from now on. */
605 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
606 return error_ret(tok);
607 assert(tok->decoding_state != STATE_INIT);
608 }
609 }
610 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
611 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
612 return error_ret(tok);
613 }
614 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 /* The default encoding is UTF-8, so make sure we don't have any
617 non-UTF-8 sequences in it. */
618 if (line && !tok->encoding) {
619 unsigned char *c;
620 int length;
621 for (c = (unsigned char *)line; *c; c += length)
622 if (!(length = valid_utf8(c))) {
623 badchar = *c;
624 break;
625 }
626 }
627 if (badchar) {
628 /* Need to add 1 to the line number, since this line
629 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200630 PyErr_Format(PyExc_SyntaxError,
631 "Non-UTF-8 code starting with '\\x%.2x' "
632 "in file %U on line %i, "
633 "but no encoding declared; "
634 "see http://python.org/dev/peps/pep-0263/ for details",
635 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 return error_ret(tok);
637 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640}
641
642static int
643decoding_feof(struct tok_state *tok)
644{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 if (tok->decoding_state != STATE_NORMAL) {
646 return feof(tok->fp);
647 } else {
648 PyObject* buf = tok->decoding_buffer;
649 if (buf == NULL) {
650 buf = PyObject_CallObject(tok->decoding_readline, NULL);
651 if (buf == NULL) {
652 error_ret(tok);
653 return 1;
654 } else {
655 tok->decoding_buffer = buf;
656 }
657 }
658 return PyObject_Length(buf) == 0;
659 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660}
661
662/* Fetch a byte from TOK, using the string buffer. */
663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000664static int
665buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667}
668
669/* Unfetch a byte from TOK, using the string buffer. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static void
672buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->str--;
674 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Set the readline function for TOK to ENC. For the string-based
678 tokenizer, this means to just record the encoding. */
679
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000680static int
681buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 tok->enc = enc;
683 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684}
685
686/* Return a UTF-8 encoding Python string object from the
687 C byte string STR, which is encoded with ENC. */
688
689static PyObject *
690translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *utf8;
692 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
693 if (buf == NULL)
694 return NULL;
695 utf8 = PyUnicode_AsUTF8String(buf);
696 Py_DECREF(buf);
697 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698}
699
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000700
701static char *
702translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200703 int skip_next_lf = 0;
704 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 char *buf, *current;
706 char c = '\0';
707 buf = PyMem_MALLOC(needed_length);
708 if (buf == NULL) {
709 tok->done = E_NOMEM;
710 return NULL;
711 }
712 for (current = buf; *s; s++, current++) {
713 c = *s;
714 if (skip_next_lf) {
715 skip_next_lf = 0;
716 if (c == '\n') {
717 c = *++s;
718 if (!c)
719 break;
720 }
721 }
722 if (c == '\r') {
723 skip_next_lf = 1;
724 c = '\n';
725 }
726 *current = c;
727 }
728 /* If this is exec input, add a newline to the end of the string if
729 there isn't one already. */
730 if (exec_input && c != '\n') {
731 *current = '\n';
732 current++;
733 }
734 *current = '\0';
735 final_length = current - buf + 1;
736 if (final_length < needed_length && final_length)
737 /* should never fail */
738 buf = PyMem_REALLOC(buf, final_length);
739 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000740}
741
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000742/* Decode a byte string STR for use as the buffer of TOK.
743 Look for encoding declarations inside STR, and record them
744 inside TOK. */
745
746static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000747decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 PyObject* utf8 = NULL;
750 const char *str;
751 const char *s;
752 const char *newl[2] = {NULL, NULL};
753 int lineno = 0;
754 tok->input = str = translate_newlines(input, single, tok);
755 if (str == NULL)
756 return NULL;
757 tok->enc = NULL;
758 tok->str = str;
759 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
760 return error_ret(tok);
761 str = tok->str; /* string after BOM if any */
762 assert(str);
763 if (tok->enc != NULL) {
764 utf8 = translate_into_utf8(str, tok->enc);
765 if (utf8 == NULL)
766 return error_ret(tok);
767 str = PyBytes_AsString(utf8);
768 }
769 for (s = str;; s++) {
770 if (*s == '\0') break;
771 else if (*s == '\n') {
772 assert(lineno < 2);
773 newl[lineno] = s;
774 lineno++;
775 if (lineno == 2) break;
776 }
777 }
778 tok->enc = NULL;
779 /* need to check line 1 and 2 separately since check_coding_spec
780 assumes a single line as input */
781 if (newl[0]) {
782 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
783 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200784 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
786 tok, buf_setreadl))
787 return error_ret(tok);
788 }
789 }
790 if (tok->enc != NULL) {
791 assert(utf8 == NULL);
792 utf8 = translate_into_utf8(str, tok->enc);
793 if (utf8 == NULL)
794 return error_ret(tok);
795 str = PyBytes_AS_STRING(utf8);
796 }
797 assert(tok->decoding_buffer == NULL);
798 tok->decoding_buffer = utf8; /* CAUTION */
799 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000800}
801
802#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803
804/* Set up tokenizer for string */
805
806struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000807PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 struct tok_state *tok = tok_new();
810 if (tok == NULL)
811 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300812 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 if (str == NULL) {
814 PyTokenizer_Free(tok);
815 return NULL;
816 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000817
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 /* XXX: constify members. */
819 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
820 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821}
822
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000823struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000824PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000825{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 struct tok_state *tok = tok_new();
827 if (tok == NULL)
828 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000829#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000831#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 if (str == NULL) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 tok->decoding_state = STATE_RAW;
837 tok->read_coding_spec = 1;
838 tok->enc = NULL;
839 tok->str = str;
840 tok->encoding = (char *)PyMem_MALLOC(6);
841 if (!tok->encoding) {
842 PyTokenizer_Free(tok);
843 return NULL;
844 }
845 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000846
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000847 /* XXX: constify members. */
848 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
849 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000850}
851
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000852/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853
854struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300855PyTokenizer_FromFile(FILE *fp, const char* enc,
856 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000858 struct tok_state *tok = tok_new();
859 if (tok == NULL)
860 return NULL;
861 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
862 PyTokenizer_Free(tok);
863 return NULL;
864 }
865 tok->cur = tok->inp = tok->buf;
866 tok->end = tok->buf + BUFSIZ;
867 tok->fp = fp;
868 tok->prompt = ps1;
869 tok->nextprompt = ps2;
870 if (enc != NULL) {
871 /* Must copy encoding declaration since it
872 gets copied into the parse tree. */
873 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
874 if (!tok->encoding) {
875 PyTokenizer_Free(tok);
876 return NULL;
877 }
878 strcpy(tok->encoding, enc);
879 tok->decoding_state = STATE_NORMAL;
880 }
881 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882}
883
884
885/* Free a tok_state structure */
886
887void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000888PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000889{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 if (tok->encoding != NULL)
891 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000892#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000893 Py_XDECREF(tok->decoding_readline);
894 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200895 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000896#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 if (tok->fp != NULL && tok->buf != NULL)
898 PyMem_FREE(tok->buf);
899 if (tok->input)
900 PyMem_FREE((char *)tok->input);
901 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000902}
903
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000904/* Get next char, updating state; error code goes into tok->done */
905
906static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200907tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000909 for (;;) {
910 if (tok->cur != tok->inp) {
911 return Py_CHARMASK(*tok->cur++); /* Fast path */
912 }
913 if (tok->done != E_OK)
914 return EOF;
915 if (tok->fp == NULL) {
916 char *end = strchr(tok->inp, '\n');
917 if (end != NULL)
918 end++;
919 else {
920 end = strchr(tok->inp, '\0');
921 if (end == tok->inp) {
922 tok->done = E_EOF;
923 return EOF;
924 }
925 }
926 if (tok->start == NULL)
927 tok->buf = tok->cur;
928 tok->line_start = tok->cur;
929 tok->lineno++;
930 tok->inp = end;
931 return Py_CHARMASK(*tok->cur++);
932 }
933 if (tok->prompt != NULL) {
934 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000935#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000936 if (newtok != NULL) {
937 char *translated = translate_newlines(newtok, 0, tok);
938 PyMem_FREE(newtok);
939 if (translated == NULL)
940 return EOF;
941 newtok = translated;
942 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000943 if (tok->encoding && newtok && *newtok) {
944 /* Recode to UTF-8 */
945 Py_ssize_t buflen;
946 const char* buf;
947 PyObject *u = translate_into_utf8(newtok, tok->encoding);
948 PyMem_FREE(newtok);
949 if (!u) {
950 tok->done = E_DECODE;
951 return EOF;
952 }
953 buflen = PyBytes_GET_SIZE(u);
954 buf = PyBytes_AS_STRING(u);
955 if (!buf) {
956 Py_DECREF(u);
957 tok->done = E_DECODE;
958 return EOF;
959 }
960 newtok = PyMem_MALLOC(buflen+1);
961 strcpy(newtok, buf);
962 Py_DECREF(u);
963 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000964#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000965 if (tok->nextprompt != NULL)
966 tok->prompt = tok->nextprompt;
967 if (newtok == NULL)
968 tok->done = E_INTR;
969 else if (*newtok == '\0') {
970 PyMem_FREE(newtok);
971 tok->done = E_EOF;
972 }
973 else if (tok->start != NULL) {
974 size_t start = tok->start - tok->buf;
975 size_t oldlen = tok->cur - tok->buf;
976 size_t newlen = oldlen + strlen(newtok);
977 char *buf = tok->buf;
978 buf = (char *)PyMem_REALLOC(buf, newlen+1);
979 tok->lineno++;
980 if (buf == NULL) {
981 PyMem_FREE(tok->buf);
982 tok->buf = NULL;
983 PyMem_FREE(newtok);
984 tok->done = E_NOMEM;
985 return EOF;
986 }
987 tok->buf = buf;
988 tok->cur = tok->buf + oldlen;
989 tok->line_start = tok->cur;
990 strcpy(tok->buf + oldlen, newtok);
991 PyMem_FREE(newtok);
992 tok->inp = tok->buf + newlen;
993 tok->end = tok->inp + 1;
994 tok->start = tok->buf + start;
995 }
996 else {
997 tok->lineno++;
998 if (tok->buf != NULL)
999 PyMem_FREE(tok->buf);
1000 tok->buf = newtok;
1001 tok->line_start = tok->buf;
1002 tok->cur = tok->buf;
1003 tok->line_start = tok->buf;
1004 tok->inp = strchr(tok->buf, '\0');
1005 tok->end = tok->inp + 1;
1006 }
1007 }
1008 else {
1009 int done = 0;
1010 Py_ssize_t cur = 0;
1011 char *pt;
1012 if (tok->start == NULL) {
1013 if (tok->buf == NULL) {
1014 tok->buf = (char *)
1015 PyMem_MALLOC(BUFSIZ);
1016 if (tok->buf == NULL) {
1017 tok->done = E_NOMEM;
1018 return EOF;
1019 }
1020 tok->end = tok->buf + BUFSIZ;
1021 }
1022 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1023 tok) == NULL) {
1024 tok->done = E_EOF;
1025 done = 1;
1026 }
1027 else {
1028 tok->done = E_OK;
1029 tok->inp = strchr(tok->buf, '\0');
1030 done = tok->inp[-1] == '\n';
1031 }
1032 }
1033 else {
1034 cur = tok->cur - tok->buf;
1035 if (decoding_feof(tok)) {
1036 tok->done = E_EOF;
1037 done = 1;
1038 }
1039 else
1040 tok->done = E_OK;
1041 }
1042 tok->lineno++;
1043 /* Read until '\n' or EOF */
1044 while (!done) {
1045 Py_ssize_t curstart = tok->start == NULL ? -1 :
1046 tok->start - tok->buf;
1047 Py_ssize_t curvalid = tok->inp - tok->buf;
1048 Py_ssize_t newsize = curvalid + BUFSIZ;
1049 char *newbuf = tok->buf;
1050 newbuf = (char *)PyMem_REALLOC(newbuf,
1051 newsize);
1052 if (newbuf == NULL) {
1053 tok->done = E_NOMEM;
1054 tok->cur = tok->inp;
1055 return EOF;
1056 }
1057 tok->buf = newbuf;
1058 tok->inp = tok->buf + curvalid;
1059 tok->end = tok->buf + newsize;
1060 tok->start = curstart < 0 ? NULL :
1061 tok->buf + curstart;
1062 if (decoding_fgets(tok->inp,
1063 (int)(tok->end - tok->inp),
1064 tok) == NULL) {
1065 /* Break out early on decoding
1066 errors, as tok->buf will be NULL
1067 */
1068 if (tok->decoding_erred)
1069 return EOF;
1070 /* Last line does not end in \n,
1071 fake one */
1072 strcpy(tok->inp, "\n");
1073 }
1074 tok->inp = strchr(tok->inp, '\0');
1075 done = tok->inp[-1] == '\n';
1076 }
1077 if (tok->buf != NULL) {
1078 tok->cur = tok->buf + cur;
1079 tok->line_start = tok->cur;
1080 /* replace "\r\n" with "\n" */
1081 /* For Mac leave the \r, giving a syntax error */
1082 pt = tok->inp - 2;
1083 if (pt >= tok->buf && *pt == '\r') {
1084 *pt++ = '\n';
1085 *pt = '\0';
1086 tok->inp = pt;
1087 }
1088 }
1089 }
1090 if (tok->done != E_OK) {
1091 if (tok->prompt != NULL)
1092 PySys_WriteStderr("\n");
1093 tok->cur = tok->inp;
1094 return EOF;
1095 }
1096 }
1097 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001098}
1099
1100
1101/* Back-up one character */
1102
1103static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001104tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001105{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 if (c != EOF) {
1107 if (--tok->cur < tok->buf)
1108 Py_FatalError("tok_backup: beginning of buffer");
1109 if (*tok->cur != c)
1110 *tok->cur = c;
1111 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112}
1113
1114
1115/* Return the token corresponding to a single character */
1116
1117int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001118PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 switch (c) {
1121 case '(': return LPAR;
1122 case ')': return RPAR;
1123 case '[': return LSQB;
1124 case ']': return RSQB;
1125 case ':': return COLON;
1126 case ',': return COMMA;
1127 case ';': return SEMI;
1128 case '+': return PLUS;
1129 case '-': return MINUS;
1130 case '*': return STAR;
1131 case '/': return SLASH;
1132 case '|': return VBAR;
1133 case '&': return AMPER;
1134 case '<': return LESS;
1135 case '>': return GREATER;
1136 case '=': return EQUAL;
1137 case '.': return DOT;
1138 case '%': return PERCENT;
1139 case '{': return LBRACE;
1140 case '}': return RBRACE;
1141 case '^': return CIRCUMFLEX;
1142 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001143 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 default: return OP;
1145 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146}
1147
1148
Guido van Rossumfbab9051991-10-20 20:25:03 +00001149int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001150PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001152 switch (c1) {
1153 case '=':
1154 switch (c2) {
1155 case '=': return EQEQUAL;
1156 }
1157 break;
1158 case '!':
1159 switch (c2) {
1160 case '=': return NOTEQUAL;
1161 }
1162 break;
1163 case '<':
1164 switch (c2) {
1165 case '>': return NOTEQUAL;
1166 case '=': return LESSEQUAL;
1167 case '<': return LEFTSHIFT;
1168 }
1169 break;
1170 case '>':
1171 switch (c2) {
1172 case '=': return GREATEREQUAL;
1173 case '>': return RIGHTSHIFT;
1174 }
1175 break;
1176 case '+':
1177 switch (c2) {
1178 case '=': return PLUSEQUAL;
1179 }
1180 break;
1181 case '-':
1182 switch (c2) {
1183 case '=': return MINEQUAL;
1184 case '>': return RARROW;
1185 }
1186 break;
1187 case '*':
1188 switch (c2) {
1189 case '*': return DOUBLESTAR;
1190 case '=': return STAREQUAL;
1191 }
1192 break;
1193 case '/':
1194 switch (c2) {
1195 case '/': return DOUBLESLASH;
1196 case '=': return SLASHEQUAL;
1197 }
1198 break;
1199 case '|':
1200 switch (c2) {
1201 case '=': return VBAREQUAL;
1202 }
1203 break;
1204 case '%':
1205 switch (c2) {
1206 case '=': return PERCENTEQUAL;
1207 }
1208 break;
1209 case '&':
1210 switch (c2) {
1211 case '=': return AMPEREQUAL;
1212 }
1213 break;
1214 case '^':
1215 switch (c2) {
1216 case '=': return CIRCUMFLEXEQUAL;
1217 }
1218 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001219 case '@':
1220 switch (c2) {
1221 case '=': return ATEQUAL;
1222 }
1223 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 }
1225 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001226}
1227
Thomas Wouters434d0822000-08-24 20:11:32 +00001228int
1229PyToken_ThreeChars(int c1, int c2, int c3)
1230{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 switch (c1) {
1232 case '<':
1233 switch (c2) {
1234 case '<':
1235 switch (c3) {
1236 case '=':
1237 return LEFTSHIFTEQUAL;
1238 }
1239 break;
1240 }
1241 break;
1242 case '>':
1243 switch (c2) {
1244 case '>':
1245 switch (c3) {
1246 case '=':
1247 return RIGHTSHIFTEQUAL;
1248 }
1249 break;
1250 }
1251 break;
1252 case '*':
1253 switch (c2) {
1254 case '*':
1255 switch (c3) {
1256 case '=':
1257 return DOUBLESTAREQUAL;
1258 }
1259 break;
1260 }
1261 break;
1262 case '/':
1263 switch (c2) {
1264 case '/':
1265 switch (c3) {
1266 case '=':
1267 return DOUBLESLASHEQUAL;
1268 }
1269 break;
1270 }
1271 break;
1272 case '.':
1273 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001274 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 switch (c3) {
1276 case '.':
1277 return ELLIPSIS;
1278 }
1279 break;
1280 }
1281 break;
1282 }
1283 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001284}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001285
Guido van Rossum926f13a1998-04-09 21:38:06 +00001286static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001287indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001288{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 if (tok->alterror) {
1290 tok->done = E_TABSPACE;
1291 tok->cur = tok->inp;
1292 return 1;
1293 }
1294 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001295#ifdef PGEN
1296 PySys_WriteStderr("inconsistent use of tabs and spaces "
1297 "in indentation\n");
1298#else
1299 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001301#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 tok->altwarning = 0;
1303 }
1304 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001305}
1306
Martin v. Löwis47383402007-08-15 07:32:56 +00001307#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001308#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001309#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310/* Verify that the identifier follows PEP 3131.
1311 All identifier strings are guaranteed to be "ready" unicode objects.
1312 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001313static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001314verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001315{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 PyObject *s;
1317 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001318 if (tok->decoding_erred)
1319 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1323 PyErr_Clear();
1324 tok->done = E_IDENTIFIER;
1325 } else {
1326 tok->done = E_ERROR;
1327 }
1328 return 0;
1329 }
1330 result = PyUnicode_IsIdentifier(s);
1331 Py_DECREF(s);
1332 if (result == 0)
1333 tok->done = E_IDENTIFIER;
1334 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001335}
1336#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001337
/* Get next token, after space stripping etc.

   Returns the token type.  For tokens that carry text, *p_start and
   *p_end are set to the start and one-past-the-end of that text inside
   the tokenizer buffer; both are NULL for INDENT, DEDENT, ENDMARKER and
   most ERRORTOKEN returns.  On ERRORTOKEN the reason is in tok->done. */

static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with tok->tabsize, altcol with
           tok->alttabsize; the two are compared against their own
           stacks to detect inconsistent tab/space usage. */
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                /* Advance to the next tab stop. */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside brackets. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents, one per call: a negative pendin
       counts outstanding DEDENTs, a positive one outstanding INDENTs. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Leave "inside async def" mode once the body has ended. */
    if (tok->async_def
        && !blankline
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is no deeper than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment */
    if (c == '#')
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process b"", r"", u"", br"" and rb"".  Each iteration accepts
           one more prefix letter; if a quote follows, the letters were
           a string prefix and scanning continues at letter_quote. */
        int saw_b = 0, saw_r = 0, saw_u = 0;
        while (1) {
            if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
                saw_u = 1;
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
                saw_r = 1;
            else
                break;
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128)
                nonascii = 1;
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Pure-ASCII names need no further validation. */
        if (nonascii && !verify_identifier(tok))
            return ERRORTOKEN;
        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5) {
            /* Current token length is 5. */
            if (tok->async_def) {
                /* We're inside an 'async def' function. */
                if (memcmp(tok->start, "async", 5) == 0)
                    return ASYNC;
                if (memcmp(tok->start, "await", 5) == 0)
                    return AWAIT;
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token.*/

                struct tok_state ahead_tok;
                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* The lookahead runs on a shallow copy of the state, so
                   the scalar fields of *tok are left untouched; the
                   copy still shares the same buffer pointers. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning 'async' NAME token, we return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Blank lines and newlines inside brackets produce no token. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            } else {
                tok_backup(tok, c);
            }
            /* Only two dots: give the second one back as well. */
            tok_backup(tok, '.');
        } else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
            if (c == 'j' || c == 'J')
                goto imaginary;
            if (c == 'x' || c == 'X') {

                /* Hex: at least one hex digit is required */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal: at least one octal digit is required */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary: at least one binary digit is required */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (c == '0')
                    c = tok_nextc(tok);
                while (isdigit(c)) {
                    nonzero = 1;
                    c = tok_nextc(tok);
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
                else if (c == 'j' || c == 'J')
                    goto imaginary;
                else if (nonzero) {
                    /* Leading zeros followed by other digits are only
                       accepted as part of a float or imaginary
                       literal. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    /* Remember the exact letter so it can be pushed
                       back if no exponent follows. */
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all: the number ends
                           before the 'e'/'E'. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;         /* closing quotes seen in a row */

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote)
                quote_size = 3;
            else
                end_quote_size = 1;     /* empty string found */
        }
        if (c != quote)
            tok_backup(tok, c);

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF) {
                if (quote_size == 3)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            /* A single-quoted string may not span lines. */
            if (quote_size == 1 && c == '\n') {
                tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (c == quote)
                end_quote_size += 1;
            else {
                end_quote_size = 0;
                if (c == '\\')
                    c = tok_nextc(tok);  /* skip escaped char */
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token, then try to extend it to a
       three-character one */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1803
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001804int
1805PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1806{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001807 int result = tok_get(tok, p_start, p_end);
1808 if (tok->decoding_erred) {
1809 result = ERRORTOKEN;
1810 tok->done = E_DECODE;
1811 }
1812 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001813}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001814
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001815/* Get the encoding of a Python file. Check for the coding cookie and check if
1816 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001817
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001818 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1819 encoding in the first or second line of the file (in which case the encoding
1820 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001821
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001822 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1823 by the caller. */
1824
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001825char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001826PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001827{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 struct tok_state *tok;
1829 FILE *fp;
1830 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001831
Victor Stinnerdaf45552013-08-28 00:53:59 +02001832#ifndef PGEN
1833 fd = _Py_dup(fd);
1834#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001835 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001836#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001837 if (fd < 0) {
1838 return NULL;
1839 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001840
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001841 fp = fdopen(fd, "r");
1842 if (fp == NULL) {
1843 return NULL;
1844 }
1845 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1846 if (tok == NULL) {
1847 fclose(fp);
1848 return NULL;
1849 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001850#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001851 if (filename != NULL) {
1852 Py_INCREF(filename);
1853 tok->filename = filename;
1854 }
1855 else {
1856 tok->filename = PyUnicode_FromString("<string>");
1857 if (tok->filename == NULL) {
1858 fclose(fp);
1859 PyTokenizer_Free(tok);
1860 return encoding;
1861 }
1862 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001863#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001864 while (tok->lineno < 2 && tok->done == E_OK) {
1865 PyTokenizer_Get(tok, &p_start, &p_end);
1866 }
1867 fclose(fp);
1868 if (tok->encoding) {
1869 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1870 if (encoding)
1871 strcpy(encoding, tok->encoding);
1872 }
1873 PyTokenizer_Free(tok);
1874 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001875}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001876
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001877char *
1878PyTokenizer_FindEncoding(int fd)
1879{
1880 return PyTokenizer_FindEncodingFilename(fd, NULL);
1881}
1882
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text [start, end) in parentheses for NAME, NUMBER, STRING
   and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif