blob: d4476aea763aa3405f304920a26535fe41b7e45a [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  The argument is parenthesized so that an expression
   (e.g. a conditional) is tested as a whole; note that it is still
   evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.  The argument is parenthesized so that
   an expression is tested as a whole; it is still evaluated more than
   once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, const char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "ATEQUAL",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "AWAIT",
    "ASYNC",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000116tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119 sizeof(struct tok_state));
120 if (tok == NULL)
121 return NULL;
122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123 tok->done = E_OK;
124 tok->fp = NULL;
125 tok->input = NULL;
126 tok->tabsize = TABSIZE;
127 tok->indent = 0;
128 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400129
130 tok->def = 0;
131 tok->defstack[0] = 0;
132 tok->deftypestack[0] = 0;
133
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 tok->atbol = 1;
135 tok->pendin = 0;
136 tok->prompt = tok->nextprompt = NULL;
137 tok->lineno = 0;
138 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
143 tok->decoding_state = STATE_INIT;
144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
146 tok->enc = NULL;
147 tok->encoding = NULL;
148 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000149#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200150 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 tok->decoding_readline = NULL;
152 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000153#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000155}
156
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000157static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700158new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000159{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700161 if (!result) {
162 tok->done = E_NOMEM;
163 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000164 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700165 memcpy(result, s, len);
166 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000168}
169
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000170#ifdef PGEN
171
172static char *
173decoding_fgets(char *s, int size, struct tok_state *tok)
174{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176}
177
178static int
179decoding_feof(struct tok_state *tok)
180{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000181 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182}
183
/* PGEN build: no decoding is done; return a new_string() copy of STR
   (NULL, with tok->done set by new_string, if the copy cannot be
   allocated).  EXEC_INPUT is unused here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
189
190#else /* PGEN */
191
192static char *
193error_ret(struct tok_state *tok) /* XXX */
194{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 tok->decoding_erred = 1;
196 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
197 PyMem_FREE(tok->buf);
198 tok->buf = NULL;
199 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000200}
201
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202
/* Normalize an encoding name, for utf-8 and latin-1 only.

   At most the first 12 characters of S are lower-cased, with '_' mapped
   to '-', and compared against the known spellings.  Returns the static
   string "utf-8" or "iso-8859-1" when S names (or starts with a variant
   prefix of) one of those encodings; otherwise returns S itself,
   unmodified.  Callers distinguish the two cases by comparing the result
   pointer with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for unsigned char values and EOF;
               plain char may be negative for non-ASCII bytes. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
231
232/* Return the coding spec in S, or NULL if none is found. */
233
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700234static int
235get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000237 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700238 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 /* Coding spec must be in a comment, and that comment must be
240 * the only statement on the source code line. */
241 for (i = 0; i < size - 6; i++) {
242 if (s[i] == '#')
243 break;
244 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700245 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 }
247 for (; i < size - 6; i++) { /* XXX inefficient search */
248 const char* t = s + i;
249 if (strncmp(t, "coding", 6) == 0) {
250 const char* begin = NULL;
251 t += 6;
252 if (t[0] != ':' && t[0] != '=')
253 continue;
254 do {
255 t++;
256 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 begin = t;
259 while (Py_ISALNUM(t[0]) ||
260 t[0] == '-' || t[0] == '_' || t[0] == '.')
261 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700264 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700265 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 if (!r)
267 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700268 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 if (r != q) {
270 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 r = new_string(q, strlen(q), tok);
272 if (!r)
273 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700275 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 }
277 }
278 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700279 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280}
281
282/* Check whether the line contains a coding spec. If it does,
283 invoke the set_readline function for the new encoding.
284 This function receives the tok_state and the new encoding.
285 Return 1 on success, 0 on failure. */
286
287static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000288check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000293
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200294 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000295 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200296 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000297 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200298 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700299 if (!get_coding_spec(line, &cs, size, tok))
300 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200301 if (!cs) {
302 Py_ssize_t i;
303 for (i = 0; i < size; i++) {
304 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
305 break;
306 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
307 /* Stop checking coding spec after a line containing
308 * anything except a comment. */
309 tok->read_coding_spec = 1;
310 break;
311 }
312 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700313 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200314 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700315 tok->read_coding_spec = 1;
316 if (tok->encoding == NULL) {
317 assert(tok->decoding_state == STATE_RAW);
318 if (strcmp(cs, "utf-8") == 0) {
319 tok->encoding = cs;
320 } else {
321 r = set_readline(tok, cs);
322 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700326 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300327 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700328 "encoding problem: %s", cs);
329 PyMem_FREE(cs);
330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700332 } else { /* then, compare cs with BOM */
333 r = (strcmp(tok->encoding, cs) == 0);
334 if (!r)
335 PyErr_Format(PyExc_SyntaxError,
336 "encoding problem: %s with BOM", cs);
337 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000339 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340}
341
342/* See whether the file starts with a BOM. If it does,
343 invoke the set_readline function with the new encoding.
344 Return 1 on success, 0 on failure. */
345
346static int
347check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 void unget_char(int, struct tok_state *),
349 int set_readline(struct tok_state *, const char *),
350 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 int ch1, ch2, ch3;
353 ch1 = get_char(tok);
354 tok->decoding_state = STATE_RAW;
355 if (ch1 == EOF) {
356 return 1;
357 } else if (ch1 == 0xEF) {
358 ch2 = get_char(tok);
359 if (ch2 != 0xBB) {
360 unget_char(ch2, tok);
361 unget_char(ch1, tok);
362 return 1;
363 }
364 ch3 = get_char(tok);
365 if (ch3 != 0xBF) {
366 unget_char(ch3, tok);
367 unget_char(ch2, tok);
368 unget_char(ch1, tok);
369 return 1;
370 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 /* Disable support for UTF-16 BOMs until a decision
373 is made whether this needs to be supported. */
374 } else if (ch1 == 0xFE) {
375 ch2 = get_char(tok);
376 if (ch2 != 0xFF) {
377 unget_char(ch2, tok);
378 unget_char(ch1, tok);
379 return 1;
380 }
381 if (!set_readline(tok, "utf-16-be"))
382 return 0;
383 tok->decoding_state = STATE_NORMAL;
384 } else if (ch1 == 0xFF) {
385 ch2 = get_char(tok);
386 if (ch2 != 0xFE) {
387 unget_char(ch2, tok);
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (!set_readline(tok, "utf-16-le"))
392 return 0;
393 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000394#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 } else {
396 unget_char(ch1, tok);
397 return 1;
398 }
399 if (tok->encoding != NULL)
400 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700401 tok->encoding = new_string("utf-8", 5, tok);
402 if (!tok->encoding)
403 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 /* No need to set_readline: input is already utf-8 */
405 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000406}
407
408/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000409 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000410
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411 On entry, tok->decoding_buffer will be one of:
412 1) NULL: need to call tok->decoding_readline to get a new line
413 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000414 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000415 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000416 (in the s buffer) to copy entire contents of the line read
417 by tok->decoding_readline. tok->decoding_buffer has the overflow.
418 In this case, fp_readl is called in a loop (with an expanded buffer)
419 until the buffer ends with a '\n' (or until the end of the file is
420 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000422
423static char *
424fp_readl(char *s, int size, struct tok_state *tok)
425{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyObject* bufobj;
427 const char *buf;
428 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000429
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 /* Ask for one less byte so we can terminate it */
431 assert(size > 0);
432 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000434 if (tok->decoding_buffer) {
435 bufobj = tok->decoding_buffer;
436 Py_INCREF(bufobj);
437 }
438 else
439 {
440 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
441 if (bufobj == NULL)
442 goto error;
443 }
444 if (PyUnicode_CheckExact(bufobj))
445 {
446 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
447 if (buf == NULL) {
448 goto error;
449 }
450 }
451 else
452 {
453 buf = PyByteArray_AsString(bufobj);
454 if (buf == NULL) {
455 goto error;
456 }
457 buflen = PyByteArray_GET_SIZE(bufobj);
458 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000460 Py_XDECREF(tok->decoding_buffer);
461 if (buflen > size) {
462 /* Too many chars, the rest goes into tok->decoding_buffer */
463 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
464 buflen-size);
465 if (tok->decoding_buffer == NULL)
466 goto error;
467 buflen = size;
468 }
469 else
470 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000471
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 memcpy(s, buf, buflen);
473 s[buflen] = '\0';
474 if (buflen == 0) /* EOF */
475 s = NULL;
476 Py_DECREF(bufobj);
477 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000478
479error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 Py_XDECREF(bufobj);
481 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482}
483
484/* Set the readline function for TOK to a StreamReader's
485 readline function. The StreamReader is named ENC.
486
487 This function is called from check_bom and check_coding_spec.
488
489 ENC is usually identical to the future value of tok->encoding,
490 except for the (currently unsupported) case of UTF-16.
491
492 Return 1 on success, 0 on failure. */
493
494static int
495fp_setreadl(struct tok_state *tok, const char* enc)
496{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200498 _Py_IDENTIFIER(open);
499 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000500 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200501 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000502
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 io = PyImport_ImportModuleNoBlock("io");
504 if (io == NULL)
505 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000506
Victor Stinner22a351a2010-10-14 12:04:34 +0000507 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200508 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100509 * position of tok->fp. If tok->fp was opened in text mode on Windows,
510 * its file position counts CRLF as one char and can't be directly mapped
511 * to the file offset for fd. Instead we step back one byte and read to
512 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200513 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100514 if (pos == -1 ||
515 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000516 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
517 goto cleanup;
518 }
519
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200520 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000521 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 if (stream == NULL)
523 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000525 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200526 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000527 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100528 if (pos > 0) {
529 if (PyObject_CallObject(readline, NULL) == NULL) {
530 readline = NULL;
531 goto cleanup;
532 }
533 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000534
535 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 Py_XDECREF(stream);
537 Py_XDECREF(io);
538 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000539}
540
541/* Fetch the next byte from TOK. */
542
543static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000544 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Unfetch the last byte back into TOK. */
548
549static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551}
552
/* Check whether the bytes at s start a structurally valid
   UTF-8 sequence.  Return the number of bytes forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the
   terminator and nothing beyond it is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
580
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000581/* Read a line of input from TOK. Determine encoding
582 if necessary. */
583
584static char *
585decoding_fgets(char *s, int size, struct tok_state *tok)
586{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 char *line = NULL;
588 int badchar = 0;
589 for (;;) {
590 if (tok->decoding_state == STATE_NORMAL) {
591 /* We already have a codec associated with
592 this input. */
593 line = fp_readl(s, size, tok);
594 break;
595 } else if (tok->decoding_state == STATE_RAW) {
596 /* We want a 'raw' read. */
597 line = Py_UniversalNewlineFgets(s, size,
598 tok->fp, NULL);
599 break;
600 } else {
601 /* We have not yet determined the encoding.
602 If an encoding is found, use the file-pointer
603 reader functions from now on. */
604 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
605 return error_ret(tok);
606 assert(tok->decoding_state != STATE_INIT);
607 }
608 }
609 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
610 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
611 return error_ret(tok);
612 }
613 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000614#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 /* The default encoding is UTF-8, so make sure we don't have any
616 non-UTF-8 sequences in it. */
617 if (line && !tok->encoding) {
618 unsigned char *c;
619 int length;
620 for (c = (unsigned char *)line; *c; c += length)
621 if (!(length = valid_utf8(c))) {
622 badchar = *c;
623 break;
624 }
625 }
626 if (badchar) {
627 /* Need to add 1 to the line number, since this line
628 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200629 PyErr_Format(PyExc_SyntaxError,
630 "Non-UTF-8 code starting with '\\x%.2x' "
631 "in file %U on line %i, "
632 "but no encoding declared; "
633 "see http://python.org/dev/peps/pep-0263/ for details",
634 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 return error_ret(tok);
636 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000638 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639}
640
641static int
642decoding_feof(struct tok_state *tok)
643{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 if (tok->decoding_state != STATE_NORMAL) {
645 return feof(tok->fp);
646 } else {
647 PyObject* buf = tok->decoding_buffer;
648 if (buf == NULL) {
649 buf = PyObject_CallObject(tok->decoding_readline, NULL);
650 if (buf == NULL) {
651 error_ret(tok);
652 return 1;
653 } else {
654 tok->decoding_buffer = buf;
655 }
656 }
657 return PyObject_Length(buf) == 0;
658 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659}
660
661/* Fetch a byte from TOK, using the string buffer. */
662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000663static int
664buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666}
667
668/* Unfetch a byte from TOK, using the string buffer. */
669
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000670static void
671buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 tok->str--;
673 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000674}
675
676/* Set the readline function for TOK to ENC. For the string-based
677 tokenizer, this means to just record the encoding. */
678
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000679static int
680buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000681 tok->enc = enc;
682 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683}
684
685/* Return a UTF-8 encoding Python string object from the
686 C byte string STR, which is encoded with ENC. */
687
688static PyObject *
689translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 PyObject *utf8;
691 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
692 if (buf == NULL)
693 return NULL;
694 utf8 = PyUnicode_AsUTF8String(buf);
695 Py_DECREF(buf);
696 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000697}
698
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000699
700static char *
701translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200702 int skip_next_lf = 0;
703 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 char *buf, *current;
705 char c = '\0';
706 buf = PyMem_MALLOC(needed_length);
707 if (buf == NULL) {
708 tok->done = E_NOMEM;
709 return NULL;
710 }
711 for (current = buf; *s; s++, current++) {
712 c = *s;
713 if (skip_next_lf) {
714 skip_next_lf = 0;
715 if (c == '\n') {
716 c = *++s;
717 if (!c)
718 break;
719 }
720 }
721 if (c == '\r') {
722 skip_next_lf = 1;
723 c = '\n';
724 }
725 *current = c;
726 }
727 /* If this is exec input, add a newline to the end of the string if
728 there isn't one already. */
729 if (exec_input && c != '\n') {
730 *current = '\n';
731 current++;
732 }
733 *current = '\0';
734 final_length = current - buf + 1;
735 if (final_length < needed_length && final_length)
736 /* should never fail */
737 buf = PyMem_REALLOC(buf, final_length);
738 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000739}
740
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000741/* Decode a byte string STR for use as the buffer of TOK.
742 Look for encoding declarations inside STR, and record them
743 inside TOK. */
744
745static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000746decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000747{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 PyObject* utf8 = NULL;
749 const char *str;
750 const char *s;
751 const char *newl[2] = {NULL, NULL};
752 int lineno = 0;
753 tok->input = str = translate_newlines(input, single, tok);
754 if (str == NULL)
755 return NULL;
756 tok->enc = NULL;
757 tok->str = str;
758 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
759 return error_ret(tok);
760 str = tok->str; /* string after BOM if any */
761 assert(str);
762 if (tok->enc != NULL) {
763 utf8 = translate_into_utf8(str, tok->enc);
764 if (utf8 == NULL)
765 return error_ret(tok);
766 str = PyBytes_AsString(utf8);
767 }
768 for (s = str;; s++) {
769 if (*s == '\0') break;
770 else if (*s == '\n') {
771 assert(lineno < 2);
772 newl[lineno] = s;
773 lineno++;
774 if (lineno == 2) break;
775 }
776 }
777 tok->enc = NULL;
778 /* need to check line 1 and 2 separately since check_coding_spec
779 assumes a single line as input */
780 if (newl[0]) {
781 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
782 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200783 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
785 tok, buf_setreadl))
786 return error_ret(tok);
787 }
788 }
789 if (tok->enc != NULL) {
790 assert(utf8 == NULL);
791 utf8 = translate_into_utf8(str, tok->enc);
792 if (utf8 == NULL)
793 return error_ret(tok);
794 str = PyBytes_AS_STRING(utf8);
795 }
796 assert(tok->decoding_buffer == NULL);
797 tok->decoding_buffer = utf8; /* CAUTION */
798 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000799}
800
801#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802
803/* Set up tokenizer for string */
804
805struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000806PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 struct tok_state *tok = tok_new();
809 if (tok == NULL)
810 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300811 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 if (str == NULL) {
813 PyTokenizer_Free(tok);
814 return NULL;
815 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000816
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 /* XXX: constify members. */
818 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
819 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000820}
821
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000822struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000823PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000824{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 struct tok_state *tok = tok_new();
826 if (tok == NULL)
827 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000828#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000830#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 if (str == NULL) {
832 PyTokenizer_Free(tok);
833 return NULL;
834 }
835 tok->decoding_state = STATE_RAW;
836 tok->read_coding_spec = 1;
837 tok->enc = NULL;
838 tok->str = str;
839 tok->encoding = (char *)PyMem_MALLOC(6);
840 if (!tok->encoding) {
841 PyTokenizer_Free(tok);
842 return NULL;
843 }
844 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000845
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 /* XXX: constify members. */
847 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
848 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000849}
850
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000851/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852
853struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300854PyTokenizer_FromFile(FILE *fp, const char* enc,
855 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 struct tok_state *tok = tok_new();
858 if (tok == NULL)
859 return NULL;
860 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
861 PyTokenizer_Free(tok);
862 return NULL;
863 }
864 tok->cur = tok->inp = tok->buf;
865 tok->end = tok->buf + BUFSIZ;
866 tok->fp = fp;
867 tok->prompt = ps1;
868 tok->nextprompt = ps2;
869 if (enc != NULL) {
870 /* Must copy encoding declaration since it
871 gets copied into the parse tree. */
872 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
873 if (!tok->encoding) {
874 PyTokenizer_Free(tok);
875 return NULL;
876 }
877 strcpy(tok->encoding, enc);
878 tok->decoding_state = STATE_NORMAL;
879 }
880 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881}
882
883
884/* Free a tok_state structure */
885
886void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000887PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000888{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (tok->encoding != NULL)
890 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000891#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892 Py_XDECREF(tok->decoding_readline);
893 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200894 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000895#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 if (tok->fp != NULL && tok->buf != NULL)
897 PyMem_FREE(tok->buf);
898 if (tok->input)
899 PyMem_FREE((char *)tok->input);
900 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000901}
902
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000903/* Get next char, updating state; error code goes into tok->done */
904
905static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200906tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908 for (;;) {
909 if (tok->cur != tok->inp) {
910 return Py_CHARMASK(*tok->cur++); /* Fast path */
911 }
912 if (tok->done != E_OK)
913 return EOF;
914 if (tok->fp == NULL) {
915 char *end = strchr(tok->inp, '\n');
916 if (end != NULL)
917 end++;
918 else {
919 end = strchr(tok->inp, '\0');
920 if (end == tok->inp) {
921 tok->done = E_EOF;
922 return EOF;
923 }
924 }
925 if (tok->start == NULL)
926 tok->buf = tok->cur;
927 tok->line_start = tok->cur;
928 tok->lineno++;
929 tok->inp = end;
930 return Py_CHARMASK(*tok->cur++);
931 }
932 if (tok->prompt != NULL) {
933 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000934#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000935 if (newtok != NULL) {
936 char *translated = translate_newlines(newtok, 0, tok);
937 PyMem_FREE(newtok);
938 if (translated == NULL)
939 return EOF;
940 newtok = translated;
941 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000942 if (tok->encoding && newtok && *newtok) {
943 /* Recode to UTF-8 */
944 Py_ssize_t buflen;
945 const char* buf;
946 PyObject *u = translate_into_utf8(newtok, tok->encoding);
947 PyMem_FREE(newtok);
948 if (!u) {
949 tok->done = E_DECODE;
950 return EOF;
951 }
952 buflen = PyBytes_GET_SIZE(u);
953 buf = PyBytes_AS_STRING(u);
954 if (!buf) {
955 Py_DECREF(u);
956 tok->done = E_DECODE;
957 return EOF;
958 }
959 newtok = PyMem_MALLOC(buflen+1);
960 strcpy(newtok, buf);
961 Py_DECREF(u);
962 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000963#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 if (tok->nextprompt != NULL)
965 tok->prompt = tok->nextprompt;
966 if (newtok == NULL)
967 tok->done = E_INTR;
968 else if (*newtok == '\0') {
969 PyMem_FREE(newtok);
970 tok->done = E_EOF;
971 }
972 else if (tok->start != NULL) {
973 size_t start = tok->start - tok->buf;
974 size_t oldlen = tok->cur - tok->buf;
975 size_t newlen = oldlen + strlen(newtok);
976 char *buf = tok->buf;
977 buf = (char *)PyMem_REALLOC(buf, newlen+1);
978 tok->lineno++;
979 if (buf == NULL) {
980 PyMem_FREE(tok->buf);
981 tok->buf = NULL;
982 PyMem_FREE(newtok);
983 tok->done = E_NOMEM;
984 return EOF;
985 }
986 tok->buf = buf;
987 tok->cur = tok->buf + oldlen;
988 tok->line_start = tok->cur;
989 strcpy(tok->buf + oldlen, newtok);
990 PyMem_FREE(newtok);
991 tok->inp = tok->buf + newlen;
992 tok->end = tok->inp + 1;
993 tok->start = tok->buf + start;
994 }
995 else {
996 tok->lineno++;
997 if (tok->buf != NULL)
998 PyMem_FREE(tok->buf);
999 tok->buf = newtok;
1000 tok->line_start = tok->buf;
1001 tok->cur = tok->buf;
1002 tok->line_start = tok->buf;
1003 tok->inp = strchr(tok->buf, '\0');
1004 tok->end = tok->inp + 1;
1005 }
1006 }
1007 else {
1008 int done = 0;
1009 Py_ssize_t cur = 0;
1010 char *pt;
1011 if (tok->start == NULL) {
1012 if (tok->buf == NULL) {
1013 tok->buf = (char *)
1014 PyMem_MALLOC(BUFSIZ);
1015 if (tok->buf == NULL) {
1016 tok->done = E_NOMEM;
1017 return EOF;
1018 }
1019 tok->end = tok->buf + BUFSIZ;
1020 }
1021 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1022 tok) == NULL) {
1023 tok->done = E_EOF;
1024 done = 1;
1025 }
1026 else {
1027 tok->done = E_OK;
1028 tok->inp = strchr(tok->buf, '\0');
1029 done = tok->inp[-1] == '\n';
1030 }
1031 }
1032 else {
1033 cur = tok->cur - tok->buf;
1034 if (decoding_feof(tok)) {
1035 tok->done = E_EOF;
1036 done = 1;
1037 }
1038 else
1039 tok->done = E_OK;
1040 }
1041 tok->lineno++;
1042 /* Read until '\n' or EOF */
1043 while (!done) {
1044 Py_ssize_t curstart = tok->start == NULL ? -1 :
1045 tok->start - tok->buf;
1046 Py_ssize_t curvalid = tok->inp - tok->buf;
1047 Py_ssize_t newsize = curvalid + BUFSIZ;
1048 char *newbuf = tok->buf;
1049 newbuf = (char *)PyMem_REALLOC(newbuf,
1050 newsize);
1051 if (newbuf == NULL) {
1052 tok->done = E_NOMEM;
1053 tok->cur = tok->inp;
1054 return EOF;
1055 }
1056 tok->buf = newbuf;
1057 tok->inp = tok->buf + curvalid;
1058 tok->end = tok->buf + newsize;
1059 tok->start = curstart < 0 ? NULL :
1060 tok->buf + curstart;
1061 if (decoding_fgets(tok->inp,
1062 (int)(tok->end - tok->inp),
1063 tok) == NULL) {
1064 /* Break out early on decoding
1065 errors, as tok->buf will be NULL
1066 */
1067 if (tok->decoding_erred)
1068 return EOF;
1069 /* Last line does not end in \n,
1070 fake one */
1071 strcpy(tok->inp, "\n");
1072 }
1073 tok->inp = strchr(tok->inp, '\0');
1074 done = tok->inp[-1] == '\n';
1075 }
1076 if (tok->buf != NULL) {
1077 tok->cur = tok->buf + cur;
1078 tok->line_start = tok->cur;
1079 /* replace "\r\n" with "\n" */
1080 /* For Mac leave the \r, giving a syntax error */
1081 pt = tok->inp - 2;
1082 if (pt >= tok->buf && *pt == '\r') {
1083 *pt++ = '\n';
1084 *pt = '\0';
1085 tok->inp = pt;
1086 }
1087 }
1088 }
1089 if (tok->done != E_OK) {
1090 if (tok->prompt != NULL)
1091 PySys_WriteStderr("\n");
1092 tok->cur = tok->inp;
1093 return EOF;
1094 }
1095 }
1096 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097}
1098
1099
1100/* Back-up one character */
1101
1102static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001103tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 if (c != EOF) {
1106 if (--tok->cur < tok->buf)
1107 Py_FatalError("tok_backup: beginning of buffer");
1108 if (*tok->cur != c)
1109 *tok->cur = c;
1110 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001111}
1112
1113
1114/* Return the token corresponding to a single character */
1115
1116int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001117PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 switch (c) {
1120 case '(': return LPAR;
1121 case ')': return RPAR;
1122 case '[': return LSQB;
1123 case ']': return RSQB;
1124 case ':': return COLON;
1125 case ',': return COMMA;
1126 case ';': return SEMI;
1127 case '+': return PLUS;
1128 case '-': return MINUS;
1129 case '*': return STAR;
1130 case '/': return SLASH;
1131 case '|': return VBAR;
1132 case '&': return AMPER;
1133 case '<': return LESS;
1134 case '>': return GREATER;
1135 case '=': return EQUAL;
1136 case '.': return DOT;
1137 case '%': return PERCENT;
1138 case '{': return LBRACE;
1139 case '}': return RBRACE;
1140 case '^': return CIRCUMFLEX;
1141 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001142 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 default: return OP;
1144 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001145}
1146
1147
Guido van Rossumfbab9051991-10-20 20:25:03 +00001148int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001149PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001150{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 switch (c1) {
1152 case '=':
1153 switch (c2) {
1154 case '=': return EQEQUAL;
1155 }
1156 break;
1157 case '!':
1158 switch (c2) {
1159 case '=': return NOTEQUAL;
1160 }
1161 break;
1162 case '<':
1163 switch (c2) {
1164 case '>': return NOTEQUAL;
1165 case '=': return LESSEQUAL;
1166 case '<': return LEFTSHIFT;
1167 }
1168 break;
1169 case '>':
1170 switch (c2) {
1171 case '=': return GREATEREQUAL;
1172 case '>': return RIGHTSHIFT;
1173 }
1174 break;
1175 case '+':
1176 switch (c2) {
1177 case '=': return PLUSEQUAL;
1178 }
1179 break;
1180 case '-':
1181 switch (c2) {
1182 case '=': return MINEQUAL;
1183 case '>': return RARROW;
1184 }
1185 break;
1186 case '*':
1187 switch (c2) {
1188 case '*': return DOUBLESTAR;
1189 case '=': return STAREQUAL;
1190 }
1191 break;
1192 case '/':
1193 switch (c2) {
1194 case '/': return DOUBLESLASH;
1195 case '=': return SLASHEQUAL;
1196 }
1197 break;
1198 case '|':
1199 switch (c2) {
1200 case '=': return VBAREQUAL;
1201 }
1202 break;
1203 case '%':
1204 switch (c2) {
1205 case '=': return PERCENTEQUAL;
1206 }
1207 break;
1208 case '&':
1209 switch (c2) {
1210 case '=': return AMPEREQUAL;
1211 }
1212 break;
1213 case '^':
1214 switch (c2) {
1215 case '=': return CIRCUMFLEXEQUAL;
1216 }
1217 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001218 case '@':
1219 switch (c2) {
1220 case '=': return ATEQUAL;
1221 }
1222 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 }
1224 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001225}
1226
Thomas Wouters434d0822000-08-24 20:11:32 +00001227int
1228PyToken_ThreeChars(int c1, int c2, int c3)
1229{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 switch (c1) {
1231 case '<':
1232 switch (c2) {
1233 case '<':
1234 switch (c3) {
1235 case '=':
1236 return LEFTSHIFTEQUAL;
1237 }
1238 break;
1239 }
1240 break;
1241 case '>':
1242 switch (c2) {
1243 case '>':
1244 switch (c3) {
1245 case '=':
1246 return RIGHTSHIFTEQUAL;
1247 }
1248 break;
1249 }
1250 break;
1251 case '*':
1252 switch (c2) {
1253 case '*':
1254 switch (c3) {
1255 case '=':
1256 return DOUBLESTAREQUAL;
1257 }
1258 break;
1259 }
1260 break;
1261 case '/':
1262 switch (c2) {
1263 case '/':
1264 switch (c3) {
1265 case '=':
1266 return DOUBLESLASHEQUAL;
1267 }
1268 break;
1269 }
1270 break;
1271 case '.':
1272 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001273 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 switch (c3) {
1275 case '.':
1276 return ELLIPSIS;
1277 }
1278 break;
1279 }
1280 break;
1281 }
1282 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001283}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001284
Guido van Rossum926f13a1998-04-09 21:38:06 +00001285static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001286indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001287{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 if (tok->alterror) {
1289 tok->done = E_TABSPACE;
1290 tok->cur = tok->inp;
1291 return 1;
1292 }
1293 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001294#ifdef PGEN
1295 PySys_WriteStderr("inconsistent use of tabs and spaces "
1296 "in indentation\n");
1297#else
1298 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001300#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301 tok->altwarning = 0;
1302 }
1303 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001304}
1305
Martin v. Löwis47383402007-08-15 07:32:56 +00001306#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001307#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001308#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309/* Verify that the identifier follows PEP 3131.
1310 All identifier strings are guaranteed to be "ready" unicode objects.
1311 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001312static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001313verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001314{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 PyObject *s;
1316 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001317 if (tok->decoding_erred)
1318 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1322 PyErr_Clear();
1323 tok->done = E_IDENTIFIER;
1324 } else {
1325 tok->done = E_ERROR;
1326 }
1327 return 0;
1328 }
1329 result = PyUnicode_IsIdentifier(s);
1330 Py_DECREF(s);
1331 if (result == 0)
1332 tok->done = E_IDENTIFIER;
1333 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001334}
1335#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001336
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001337/* Get next token, after space stripping etc. */
1338
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001339static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001340tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001342 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001344
Yury Selivanov75445082015-05-11 22:57:16 -04001345 int tok_len;
1346 struct tok_state ahead_tok;
1347 char *ahead_tok_start = NULL, *ahead_top_end = NULL;
1348 int ahead_tok_kind;
1349
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001351 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352 tok->start = NULL;
1353 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001354
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001355 /* Get indentation level */
1356 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001357 int col = 0;
1358 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 tok->atbol = 0;
1360 for (;;) {
1361 c = tok_nextc(tok);
1362 if (c == ' ')
1363 col++, altcol++;
1364 else if (c == '\t') {
1365 col = (col/tok->tabsize + 1) * tok->tabsize;
1366 altcol = (altcol/tok->alttabsize + 1)
1367 * tok->alttabsize;
1368 }
1369 else if (c == '\014') /* Control-L (formfeed) */
1370 col = altcol = 0; /* For Emacs users */
1371 else
1372 break;
1373 }
1374 tok_backup(tok, c);
1375 if (c == '#' || c == '\n') {
1376 /* Lines with only whitespace and/or comments
1377 shouldn't affect the indentation and are
1378 not passed to the parser as NEWLINE tokens,
1379 except *totally* empty lines in interactive
1380 mode, which signal the end of a command group. */
1381 if (col == 0 && c == '\n' && tok->prompt != NULL)
1382 blankline = 0; /* Let it through */
1383 else
1384 blankline = 1; /* Ignore completely */
1385 /* We can't jump back right here since we still
1386 may need to skip to the end of a comment */
1387 }
1388 if (!blankline && tok->level == 0) {
1389 if (col == tok->indstack[tok->indent]) {
1390 /* No change */
1391 if (altcol != tok->altindstack[tok->indent]) {
1392 if (indenterror(tok))
1393 return ERRORTOKEN;
1394 }
1395 }
1396 else if (col > tok->indstack[tok->indent]) {
1397 /* Indent -- always one */
1398 if (tok->indent+1 >= MAXINDENT) {
1399 tok->done = E_TOODEEP;
1400 tok->cur = tok->inp;
1401 return ERRORTOKEN;
1402 }
1403 if (altcol <= tok->altindstack[tok->indent]) {
1404 if (indenterror(tok))
1405 return ERRORTOKEN;
1406 }
1407 tok->pendin++;
1408 tok->indstack[++tok->indent] = col;
1409 tok->altindstack[tok->indent] = altcol;
1410 }
1411 else /* col < tok->indstack[tok->indent] */ {
1412 /* Dedent -- any number, must be consistent */
1413 while (tok->indent > 0 &&
1414 col < tok->indstack[tok->indent]) {
1415 tok->pendin--;
1416 tok->indent--;
1417 }
1418 if (col != tok->indstack[tok->indent]) {
1419 tok->done = E_DEDENT;
1420 tok->cur = tok->inp;
1421 return ERRORTOKEN;
1422 }
1423 if (altcol != tok->altindstack[tok->indent]) {
1424 if (indenterror(tok))
1425 return ERRORTOKEN;
1426 }
1427 }
1428 }
1429 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001430
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 /* Return pending indents/dedents */
1434 if (tok->pendin != 0) {
1435 if (tok->pendin < 0) {
1436 tok->pendin++;
Yury Selivanov75445082015-05-11 22:57:16 -04001437
1438 while (tok->def && tok->defstack[tok->def] >= tok->indent) {
1439 tok->def--;
1440 }
1441
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 return DEDENT;
1443 }
1444 else {
1445 tok->pendin--;
1446 return INDENT;
1447 }
1448 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001449
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001450 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 tok->start = NULL;
1452 /* Skip spaces */
1453 do {
1454 c = tok_nextc(tok);
1455 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001456
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001457 /* Set start of current token */
1458 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 /* Skip comment */
1461 if (c == '#')
1462 while (c != EOF && c != '\n')
1463 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001464
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001465 /* Check for EOF and errors now */
1466 if (c == EOF) {
1467 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1468 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001469
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001470 /* Identifier (most frequent token!) */
1471 nonascii = 0;
1472 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001473 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001474 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001475 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001476 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001477 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001478 /* Since this is a backwards compatibility support literal we don't
1479 want to support it in arbitrary order like byte literals. */
1480 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1481 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001482 /* ur"" and ru"" are not supported */
1483 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001484 saw_r = 1;
1485 else
1486 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001487 c = tok_nextc(tok);
1488 if (c == '"' || c == '\'')
1489 goto letter_quote;
1490 }
1491 while (is_potential_identifier_char(c)) {
1492 if (c >= 128)
1493 nonascii = 1;
1494 c = tok_nextc(tok);
1495 }
1496 tok_backup(tok, c);
Benjamin Petersond73aca72015-04-21 12:05:19 -04001497 if (nonascii && !verify_identifier(tok))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001498 return ERRORTOKEN;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 *p_start = tok->start;
1500 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001501
1502 tok_len = tok->cur - tok->start;
1503 if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
Yury Selivanov75445082015-05-11 22:57:16 -04001504 if (tok->def && tok->deftypestack[tok->def] == 3) {
1505 tok->deftypestack[tok->def] = 2;
1506 }
Yury Selivanov8085b802015-05-18 12:50:52 -04001507 else if (tok->defstack[tok->def] < tok->indent) {
1508 /* We advance defs stack only when we see "def" *and*
1509 the indentation level was increased relative to the
1510 previous "def". */
1511
1512 if (tok->def + 1 >= MAXINDENT) {
1513 tok->done = E_TOODEEP;
1514 tok->cur = tok->inp;
1515 return ERRORTOKEN;
1516 }
1517
Yury Selivanov75445082015-05-11 22:57:16 -04001518 tok->def++;
1519 tok->defstack[tok->def] = tok->indent;
1520 tok->deftypestack[tok->def] = 1;
1521 }
1522 }
1523 else if (tok_len == 5) {
1524 if (memcmp(tok->start, "async", 5) == 0) {
1525 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1526
1527 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1528 &ahead_top_end);
1529
1530 if (ahead_tok_kind == NAME &&
1531 ahead_tok.cur - ahead_tok.start == 3 &&
1532 memcmp(ahead_tok.start, "def", 3) == 0) {
1533
Yury Selivanov8085b802015-05-18 12:50:52 -04001534 if (tok->def + 1 >= MAXINDENT) {
1535 tok->done = E_TOODEEP;
1536 tok->cur = tok->inp;
1537 return ERRORTOKEN;
1538 }
1539
Yury Selivanov75445082015-05-11 22:57:16 -04001540 tok->def++;
1541 tok->defstack[tok->def] = tok->indent;
1542 tok->deftypestack[tok->def] = 3;
1543
1544 return ASYNC;
1545 }
1546 else if (tok->def && tok->deftypestack[tok->def] == 2
1547 && tok->defstack[tok->def] < tok->indent) {
1548
1549 return ASYNC;
1550 }
1551
1552 }
1553 else if (memcmp(tok->start, "await", 5) == 0
1554 && tok->def && tok->deftypestack[tok->def] == 2
1555 && tok->defstack[tok->def] < tok->indent) {
1556
1557 return AWAIT;
1558 }
1559 }
1560
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 return NAME;
1562 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001563
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001564 /* Newline */
1565 if (c == '\n') {
1566 tok->atbol = 1;
1567 if (blankline || tok->level > 0)
1568 goto nextline;
1569 *p_start = tok->start;
1570 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1571 tok->cont_line = 0;
1572 return NEWLINE;
1573 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001574
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 /* Period or number starting with period? */
1576 if (c == '.') {
1577 c = tok_nextc(tok);
1578 if (isdigit(c)) {
1579 goto fraction;
1580 } else if (c == '.') {
1581 c = tok_nextc(tok);
1582 if (c == '.') {
1583 *p_start = tok->start;
1584 *p_end = tok->cur;
1585 return ELLIPSIS;
1586 } else {
1587 tok_backup(tok, c);
1588 }
1589 tok_backup(tok, '.');
1590 } else {
1591 tok_backup(tok, c);
1592 }
1593 *p_start = tok->start;
1594 *p_end = tok->cur;
1595 return DOT;
1596 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001597
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 /* Number */
1599 if (isdigit(c)) {
1600 if (c == '0') {
1601 /* Hex, octal or binary -- maybe. */
1602 c = tok_nextc(tok);
1603 if (c == '.')
1604 goto fraction;
1605 if (c == 'j' || c == 'J')
1606 goto imaginary;
1607 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001608
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 /* Hex */
1610 c = tok_nextc(tok);
1611 if (!isxdigit(c)) {
1612 tok->done = E_TOKEN;
1613 tok_backup(tok, c);
1614 return ERRORTOKEN;
1615 }
1616 do {
1617 c = tok_nextc(tok);
1618 } while (isxdigit(c));
1619 }
1620 else if (c == 'o' || c == 'O') {
1621 /* Octal */
1622 c = tok_nextc(tok);
1623 if (c < '0' || c >= '8') {
1624 tok->done = E_TOKEN;
1625 tok_backup(tok, c);
1626 return ERRORTOKEN;
1627 }
1628 do {
1629 c = tok_nextc(tok);
1630 } while ('0' <= c && c < '8');
1631 }
1632 else if (c == 'b' || c == 'B') {
1633 /* Binary */
1634 c = tok_nextc(tok);
1635 if (c != '0' && c != '1') {
1636 tok->done = E_TOKEN;
1637 tok_backup(tok, c);
1638 return ERRORTOKEN;
1639 }
1640 do {
1641 c = tok_nextc(tok);
1642 } while (c == '0' || c == '1');
1643 }
1644 else {
1645 int nonzero = 0;
1646 /* maybe old-style octal; c is first char of it */
1647 /* in any case, allow '0' as a literal */
1648 while (c == '0')
1649 c = tok_nextc(tok);
1650 while (isdigit(c)) {
1651 nonzero = 1;
1652 c = tok_nextc(tok);
1653 }
1654 if (c == '.')
1655 goto fraction;
1656 else if (c == 'e' || c == 'E')
1657 goto exponent;
1658 else if (c == 'j' || c == 'J')
1659 goto imaginary;
1660 else if (nonzero) {
1661 tok->done = E_TOKEN;
1662 tok_backup(tok, c);
1663 return ERRORTOKEN;
1664 }
1665 }
1666 }
1667 else {
1668 /* Decimal */
1669 do {
1670 c = tok_nextc(tok);
1671 } while (isdigit(c));
1672 {
1673 /* Accept floating point numbers. */
1674 if (c == '.') {
1675 fraction:
1676 /* Fraction */
1677 do {
1678 c = tok_nextc(tok);
1679 } while (isdigit(c));
1680 }
1681 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001682 int e;
1683 exponent:
1684 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 /* Exponent part */
1686 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001687 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001689 if (!isdigit(c)) {
1690 tok->done = E_TOKEN;
1691 tok_backup(tok, c);
1692 return ERRORTOKEN;
1693 }
1694 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001696 tok_backup(tok, e);
1697 *p_start = tok->start;
1698 *p_end = tok->cur;
1699 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 }
1701 do {
1702 c = tok_nextc(tok);
1703 } while (isdigit(c));
1704 }
1705 if (c == 'j' || c == 'J')
1706 /* Imaginary part */
1707 imaginary:
1708 c = tok_nextc(tok);
1709 }
1710 }
1711 tok_backup(tok, c);
1712 *p_start = tok->start;
1713 *p_end = tok->cur;
1714 return NUMBER;
1715 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001716
1717 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 /* String */
1719 if (c == '\'' || c == '"') {
1720 int quote = c;
1721 int quote_size = 1; /* 1 or 3 */
1722 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001723
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 /* Find the quote size and start of string */
1725 c = tok_nextc(tok);
1726 if (c == quote) {
1727 c = tok_nextc(tok);
1728 if (c == quote)
1729 quote_size = 3;
1730 else
1731 end_quote_size = 1; /* empty string found */
1732 }
1733 if (c != quote)
1734 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001735
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736 /* Get rest of string */
1737 while (end_quote_size != quote_size) {
1738 c = tok_nextc(tok);
1739 if (c == EOF) {
1740 if (quote_size == 3)
1741 tok->done = E_EOFS;
1742 else
1743 tok->done = E_EOLS;
1744 tok->cur = tok->inp;
1745 return ERRORTOKEN;
1746 }
1747 if (quote_size == 1 && c == '\n') {
1748 tok->done = E_EOLS;
1749 tok->cur = tok->inp;
1750 return ERRORTOKEN;
1751 }
1752 if (c == quote)
1753 end_quote_size += 1;
1754 else {
1755 end_quote_size = 0;
1756 if (c == '\\')
1757 c = tok_nextc(tok); /* skip escaped char */
1758 }
1759 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001760
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001761 *p_start = tok->start;
1762 *p_end = tok->cur;
1763 return STRING;
1764 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001765
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 /* Line continuation */
1767 if (c == '\\') {
1768 c = tok_nextc(tok);
1769 if (c != '\n') {
1770 tok->done = E_LINECONT;
1771 tok->cur = tok->inp;
1772 return ERRORTOKEN;
1773 }
1774 tok->cont_line = 1;
1775 goto again; /* Read next line */
1776 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001777
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001778 /* Check for two-character token */
1779 {
1780 int c2 = tok_nextc(tok);
1781 int token = PyToken_TwoChars(c, c2);
1782 if (token != OP) {
1783 int c3 = tok_nextc(tok);
1784 int token3 = PyToken_ThreeChars(c, c2, c3);
1785 if (token3 != OP) {
1786 token = token3;
1787 } else {
1788 tok_backup(tok, c3);
1789 }
1790 *p_start = tok->start;
1791 *p_end = tok->cur;
1792 return token;
1793 }
1794 tok_backup(tok, c2);
1795 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001796
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001797 /* Keep track of parentheses nesting level */
1798 switch (c) {
1799 case '(':
1800 case '[':
1801 case '{':
1802 tok->level++;
1803 break;
1804 case ')':
1805 case ']':
1806 case '}':
1807 tok->level--;
1808 break;
1809 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001810
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 /* Punctuation character */
1812 *p_start = tok->start;
1813 *p_end = tok->cur;
1814 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001815}
1816
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001817int
1818PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1819{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001820 int result = tok_get(tok, p_start, p_end);
1821 if (tok->decoding_erred) {
1822 result = ERRORTOKEN;
1823 tok->done = E_DECODE;
1824 }
1825 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001826}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001827
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001828/* Get the encoding of a Python file. Check for the coding cookie and check if
1829 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001830
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001831 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1832 encoding in the first or second line of the file (in which case the encoding
1833 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001834
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001835 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1836 by the caller. */
1837
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001838char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001839PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001840{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001841 struct tok_state *tok;
1842 FILE *fp;
1843 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001844
Victor Stinnerdaf45552013-08-28 00:53:59 +02001845#ifndef PGEN
1846 fd = _Py_dup(fd);
1847#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001848 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001849#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 if (fd < 0) {
1851 return NULL;
1852 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001853
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001854 fp = fdopen(fd, "r");
1855 if (fp == NULL) {
1856 return NULL;
1857 }
1858 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1859 if (tok == NULL) {
1860 fclose(fp);
1861 return NULL;
1862 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001863#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001864 if (filename != NULL) {
1865 Py_INCREF(filename);
1866 tok->filename = filename;
1867 }
1868 else {
1869 tok->filename = PyUnicode_FromString("<string>");
1870 if (tok->filename == NULL) {
1871 fclose(fp);
1872 PyTokenizer_Free(tok);
1873 return encoding;
1874 }
1875 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001876#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 while (tok->lineno < 2 && tok->done == E_OK) {
1878 PyTokenizer_Get(tok, &p_start, &p_end);
1879 }
1880 fclose(fp);
1881 if (tok->encoding) {
1882 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1883 if (encoding)
1884 strcpy(encoding, tok->encoding);
1885 }
1886 PyTokenizer_Free(tok);
1887 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001888}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001889
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001890char *
1891PyTokenizer_FindEncoding(int fd)
1892{
1893 return PyTokenizer_FindEncodingFilename(fd, NULL);
1894}
1895
#ifdef Py_DEBUG

/* Print a token to stdout: its type name, followed by the token text in
   parentheses for NAME, NUMBER, STRING and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif
1906#endif