blob: 51f98e9b2e9213aa775243ad200e199e701a9d17 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  The argument is fully parenthesized so that an
   expression (e.g. `ch & 0x7f`) is tested as a whole; note that it is
   still evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  The argument is fully parenthesized
   so that an expression (e.g. `ch & 0x7f`) is tested as a whole; note that
   it is still evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, in token-number order.
   This table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    /* markers, names, literals and layout */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons and remaining operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "ATEQUAL", "RARROW", "ELLIPSIS",

    "OP",
    "AWAIT", "ASYNC",
    "<ERRORTOKEN>",
    "COMMENT", "NL", "ENCODING",
    "<N_TOKENS>"
};
114
115
116/* Create and initialize a new tok_state structure */
117
118static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000119tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
122 sizeof(struct tok_state));
123 if (tok == NULL)
124 return NULL;
125 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
126 tok->done = E_OK;
127 tok->fp = NULL;
128 tok->input = NULL;
129 tok->tabsize = TABSIZE;
130 tok->indent = 0;
131 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400132
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
137 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000138 tok->altwarning = 1;
139 tok->alterror = 1;
140 tok->alttabsize = 1;
141 tok->altindstack[0] = 0;
142 tok->decoding_state = STATE_INIT;
143 tok->decoding_erred = 0;
144 tok->read_coding_spec = 0;
145 tok->enc = NULL;
146 tok->encoding = NULL;
147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200149 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 tok->decoding_readline = NULL;
151 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000152#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300153
154 tok->async_def = 0;
155 tok->async_def_indent = 0;
156 tok->async_def_nl = 0;
157
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000159}
160
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000161static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000163{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000164 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700165 if (!result) {
166 tok->done = E_NOMEM;
167 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700169 memcpy(result, s, len);
170 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000172}
173
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174#ifdef PGEN
175
176static char *
177decoding_fgets(char *s, int size, struct tok_state *tok)
178{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180}
181
182static int
183decoding_feof(struct tok_state *tok)
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000186}
187
/* PGEN build: no decoding is done; return a newly allocated copy of STR
   (NULL if the allocation fails).  EXEC_INPUT is not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
193
194#else /* PGEN */
195
196static char *
197error_ret(struct tok_state *tok) /* XXX */
198{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000199 tok->decoding_erred = 1;
200 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
201 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200202 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
203 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000204 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000205}
206
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000207
/* Map the common spellings of the UTF-8 and Latin-1 codec names to a
   canonical form.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by a "-suffix"); otherwise returns S itself, so callers can
   detect "no change" by comparing pointers. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
236
237/* Return the coding spec in S, or NULL if none is found. */
238
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700239static int
240get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700243 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 /* Coding spec must be in a comment, and that comment must be
245 * the only statement on the source code line. */
246 for (i = 0; i < size - 6; i++) {
247 if (s[i] == '#')
248 break;
249 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700250 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000251 }
252 for (; i < size - 6; i++) { /* XXX inefficient search */
253 const char* t = s + i;
254 if (strncmp(t, "coding", 6) == 0) {
255 const char* begin = NULL;
256 t += 6;
257 if (t[0] != ':' && t[0] != '=')
258 continue;
259 do {
260 t++;
261 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000262
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 begin = t;
264 while (Py_ISALNUM(t[0]) ||
265 t[0] == '-' || t[0] == '_' || t[0] == '.')
266 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700269 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200270 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 if (!r)
272 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700273 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 if (r != q) {
275 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700276 r = new_string(q, strlen(q), tok);
277 if (!r)
278 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700280 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200281 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 }
283 }
284 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700285 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286}
287
288/* Check whether the line contains a coding spec. If it does,
289 invoke the set_readline function for the new encoding.
290 This function receives the tok_state and the new encoding.
291 Return 1 on success, 0 on failure. */
292
293static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000295 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700297 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000299
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200300 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200302 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200304 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 if (!get_coding_spec(line, &cs, size, tok))
306 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200307 if (!cs) {
308 Py_ssize_t i;
309 for (i = 0; i < size; i++) {
310 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
311 break;
312 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
313 /* Stop checking coding spec after a line containing
314 * anything except a comment. */
315 tok->read_coding_spec = 1;
316 break;
317 }
318 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700319 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200320 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700321 tok->read_coding_spec = 1;
322 if (tok->encoding == NULL) {
323 assert(tok->decoding_state == STATE_RAW);
324 if (strcmp(cs, "utf-8") == 0) {
325 tok->encoding = cs;
326 } else {
327 r = set_readline(tok, cs);
328 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700330 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700332 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300333 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700334 "encoding problem: %s", cs);
335 PyMem_FREE(cs);
336 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000337 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700338 } else { /* then, compare cs with BOM */
339 r = (strcmp(tok->encoding, cs) == 0);
340 if (!r)
341 PyErr_Format(PyExc_SyntaxError,
342 "encoding problem: %s with BOM", cs);
343 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000345 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000346}
347
348/* See whether the file starts with a BOM. If it does,
349 invoke the set_readline function with the new encoding.
350 Return 1 on success, 0 on failure. */
351
352static int
353check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000354 void unget_char(int, struct tok_state *),
355 int set_readline(struct tok_state *, const char *),
356 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000357{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 int ch1, ch2, ch3;
359 ch1 = get_char(tok);
360 tok->decoding_state = STATE_RAW;
361 if (ch1 == EOF) {
362 return 1;
363 } else if (ch1 == 0xEF) {
364 ch2 = get_char(tok);
365 if (ch2 != 0xBB) {
366 unget_char(ch2, tok);
367 unget_char(ch1, tok);
368 return 1;
369 }
370 ch3 = get_char(tok);
371 if (ch3 != 0xBF) {
372 unget_char(ch3, tok);
373 unget_char(ch2, tok);
374 unget_char(ch1, tok);
375 return 1;
376 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000377#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 /* Disable support for UTF-16 BOMs until a decision
379 is made whether this needs to be supported. */
380 } else if (ch1 == 0xFE) {
381 ch2 = get_char(tok);
382 if (ch2 != 0xFF) {
383 unget_char(ch2, tok);
384 unget_char(ch1, tok);
385 return 1;
386 }
387 if (!set_readline(tok, "utf-16-be"))
388 return 0;
389 tok->decoding_state = STATE_NORMAL;
390 } else if (ch1 == 0xFF) {
391 ch2 = get_char(tok);
392 if (ch2 != 0xFE) {
393 unget_char(ch2, tok);
394 unget_char(ch1, tok);
395 return 1;
396 }
397 if (!set_readline(tok, "utf-16-le"))
398 return 0;
399 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 } else {
402 unget_char(ch1, tok);
403 return 1;
404 }
405 if (tok->encoding != NULL)
406 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700407 tok->encoding = new_string("utf-8", 5, tok);
408 if (!tok->encoding)
409 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000410 /* No need to set_readline: input is already utf-8 */
411 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000412}
413
414/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000415 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000416
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000417 On entry, tok->decoding_buffer will be one of:
418 1) NULL: need to call tok->decoding_readline to get a new line
419 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000420 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000421 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 (in the s buffer) to copy entire contents of the line read
423 by tok->decoding_readline. tok->decoding_buffer has the overflow.
424 In this case, fp_readl is called in a loop (with an expanded buffer)
425 until the buffer ends with a '\n' (or until the end of the file is
426 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000427*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428
429static char *
430fp_readl(char *s, int size, struct tok_state *tok)
431{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000432 PyObject* bufobj;
433 const char *buf;
434 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000435
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000436 /* Ask for one less byte so we can terminate it */
437 assert(size > 0);
438 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000439
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440 if (tok->decoding_buffer) {
441 bufobj = tok->decoding_buffer;
442 Py_INCREF(bufobj);
443 }
444 else
445 {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100446 bufobj = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 if (bufobj == NULL)
448 goto error;
449 }
450 if (PyUnicode_CheckExact(bufobj))
451 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200452 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 if (buf == NULL) {
454 goto error;
455 }
456 }
457 else
458 {
459 buf = PyByteArray_AsString(bufobj);
460 if (buf == NULL) {
461 goto error;
462 }
463 buflen = PyByteArray_GET_SIZE(bufobj);
464 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000465
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000466 Py_XDECREF(tok->decoding_buffer);
467 if (buflen > size) {
468 /* Too many chars, the rest goes into tok->decoding_buffer */
469 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
470 buflen-size);
471 if (tok->decoding_buffer == NULL)
472 goto error;
473 buflen = size;
474 }
475 else
476 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000477
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000478 memcpy(s, buf, buflen);
479 s[buflen] = '\0';
480 if (buflen == 0) /* EOF */
481 s = NULL;
482 Py_DECREF(bufobj);
483 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000484
485error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 Py_XDECREF(bufobj);
487 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488}
489
490/* Set the readline function for TOK to a StreamReader's
491 readline function. The StreamReader is named ENC.
492
493 This function is called from check_bom and check_coding_spec.
494
495 ENC is usually identical to the future value of tok->encoding,
496 except for the (currently unsupported) case of UTF-16.
497
498 Return 1 on success, 0 on failure. */
499
500static int
501fp_setreadl(struct tok_state *tok, const char* enc)
502{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700503 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200504 _Py_IDENTIFIER(open);
505 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000506 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200507 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000508
Victor Stinner22a351a2010-10-14 12:04:34 +0000509 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200510 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100511 * position of tok->fp. If tok->fp was opened in text mode on Windows,
512 * its file position counts CRLF as one char and can't be directly mapped
513 * to the file offset for fd. Instead we step back one byte and read to
514 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200515 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100516 if (pos == -1 ||
517 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000518 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700519 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000520 }
521
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700522 io = PyImport_ImportModuleNoBlock("io");
523 if (io == NULL)
524 return 0;
525
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200526 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000527 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700528 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000529 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700530 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200532 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700533 Py_DECREF(stream);
534 if (readline == NULL)
535 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300536 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700537
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100538 if (pos > 0) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100539 PyObject *bufobj = _PyObject_CallNoArg(readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700540 if (bufobj == NULL)
541 return 0;
542 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100543 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000544
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700545 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546}
547
548/* Fetch the next byte from TOK. */
549
550static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552}
553
554/* Unfetch the last byte back into TOK. */
555
556static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558}
559
/* Check whether the bytes at S start a structurally valid UTF-8 sequence
   (a lead byte followed by the right number of 10xxxxxx continuation
   bytes).  Return the number of bytes forming the sequence if yes,
   0 if not.

   S must be NUL-terminated.  Continuation bytes are examined in forward
   order, so a sequence cut short by the terminating NUL is rejected at
   the NUL and no byte beyond the terminator is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* A NUL is < 0x80, so the scan stops here at end of string. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
587
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588/* Read a line of input from TOK. Determine encoding
589 if necessary. */
590
591static char *
592decoding_fgets(char *s, int size, struct tok_state *tok)
593{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000594 char *line = NULL;
595 int badchar = 0;
596 for (;;) {
597 if (tok->decoding_state == STATE_NORMAL) {
598 /* We already have a codec associated with
599 this input. */
600 line = fp_readl(s, size, tok);
601 break;
602 } else if (tok->decoding_state == STATE_RAW) {
603 /* We want a 'raw' read. */
604 line = Py_UniversalNewlineFgets(s, size,
605 tok->fp, NULL);
606 break;
607 } else {
608 /* We have not yet determined the encoding.
609 If an encoding is found, use the file-pointer
610 reader functions from now on. */
611 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
612 return error_ret(tok);
613 assert(tok->decoding_state != STATE_INIT);
614 }
615 }
616 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
617 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
618 return error_ret(tok);
619 }
620 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000622 /* The default encoding is UTF-8, so make sure we don't have any
623 non-UTF-8 sequences in it. */
624 if (line && !tok->encoding) {
625 unsigned char *c;
626 int length;
627 for (c = (unsigned char *)line; *c; c += length)
628 if (!(length = valid_utf8(c))) {
629 badchar = *c;
630 break;
631 }
632 }
633 if (badchar) {
634 /* Need to add 1 to the line number, since this line
635 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200636 PyErr_Format(PyExc_SyntaxError,
637 "Non-UTF-8 code starting with '\\x%.2x' "
638 "in file %U on line %i, "
639 "but no encoding declared; "
640 "see http://python.org/dev/peps/pep-0263/ for details",
641 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 return error_ret(tok);
643 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646}
647
648static int
649decoding_feof(struct tok_state *tok)
650{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 if (tok->decoding_state != STATE_NORMAL) {
652 return feof(tok->fp);
653 } else {
654 PyObject* buf = tok->decoding_buffer;
655 if (buf == NULL) {
Victor Stinnera5ed5f02016-12-06 18:45:50 +0100656 buf = _PyObject_CallNoArg(tok->decoding_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 if (buf == NULL) {
658 error_ret(tok);
659 return 1;
660 } else {
661 tok->decoding_buffer = buf;
662 }
663 }
664 return PyObject_Length(buf) == 0;
665 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666}
667
668/* Fetch a byte from TOK, using the string buffer. */
669
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000670static int
671buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000673}
674
675/* Unfetch a byte from TOK, using the string buffer. */
676
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000677static void
678buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 tok->str--;
680 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000681}
682
683/* Set the readline function for TOK to ENC. For the string-based
684 tokenizer, this means to just record the encoding. */
685
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000686static int
687buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000688 tok->enc = enc;
689 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000690}
691
692/* Return a UTF-8 encoding Python string object from the
693 C byte string STR, which is encoded with ENC. */
694
695static PyObject *
696translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000697 PyObject *utf8;
698 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
699 if (buf == NULL)
700 return NULL;
701 utf8 = PyUnicode_AsUTF8String(buf);
702 Py_DECREF(buf);
703 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000704}
705
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000706
707static char *
708translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200709 int skip_next_lf = 0;
710 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 char *buf, *current;
712 char c = '\0';
713 buf = PyMem_MALLOC(needed_length);
714 if (buf == NULL) {
715 tok->done = E_NOMEM;
716 return NULL;
717 }
718 for (current = buf; *s; s++, current++) {
719 c = *s;
720 if (skip_next_lf) {
721 skip_next_lf = 0;
722 if (c == '\n') {
723 c = *++s;
724 if (!c)
725 break;
726 }
727 }
728 if (c == '\r') {
729 skip_next_lf = 1;
730 c = '\n';
731 }
732 *current = c;
733 }
734 /* If this is exec input, add a newline to the end of the string if
735 there isn't one already. */
736 if (exec_input && c != '\n') {
737 *current = '\n';
738 current++;
739 }
740 *current = '\0';
741 final_length = current - buf + 1;
742 if (final_length < needed_length && final_length)
743 /* should never fail */
744 buf = PyMem_REALLOC(buf, final_length);
745 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000746}
747
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748/* Decode a byte string STR for use as the buffer of TOK.
749 Look for encoding declarations inside STR, and record them
750 inside TOK. */
751
752static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000753decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000754{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 PyObject* utf8 = NULL;
756 const char *str;
757 const char *s;
758 const char *newl[2] = {NULL, NULL};
759 int lineno = 0;
760 tok->input = str = translate_newlines(input, single, tok);
761 if (str == NULL)
762 return NULL;
763 tok->enc = NULL;
764 tok->str = str;
765 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
766 return error_ret(tok);
767 str = tok->str; /* string after BOM if any */
768 assert(str);
769 if (tok->enc != NULL) {
770 utf8 = translate_into_utf8(str, tok->enc);
771 if (utf8 == NULL)
772 return error_ret(tok);
773 str = PyBytes_AsString(utf8);
774 }
775 for (s = str;; s++) {
776 if (*s == '\0') break;
777 else if (*s == '\n') {
778 assert(lineno < 2);
779 newl[lineno] = s;
780 lineno++;
781 if (lineno == 2) break;
782 }
783 }
784 tok->enc = NULL;
785 /* need to check line 1 and 2 separately since check_coding_spec
786 assumes a single line as input */
787 if (newl[0]) {
788 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
789 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200790 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
792 tok, buf_setreadl))
793 return error_ret(tok);
794 }
795 }
796 if (tok->enc != NULL) {
797 assert(utf8 == NULL);
798 utf8 = translate_into_utf8(str, tok->enc);
799 if (utf8 == NULL)
800 return error_ret(tok);
801 str = PyBytes_AS_STRING(utf8);
802 }
803 assert(tok->decoding_buffer == NULL);
804 tok->decoding_buffer = utf8; /* CAUTION */
805 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000806}
807
808#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809
810/* Set up tokenizer for string */
811
812struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000813PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000814{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000815 struct tok_state *tok = tok_new();
816 if (tok == NULL)
817 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300818 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 if (str == NULL) {
820 PyTokenizer_Free(tok);
821 return NULL;
822 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000823
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 /* XXX: constify members. */
825 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
826 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000827}
828
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000829struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000830PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000831{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 struct tok_state *tok = tok_new();
833 if (tok == NULL)
834 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000835#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000837#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 if (str == NULL) {
839 PyTokenizer_Free(tok);
840 return NULL;
841 }
842 tok->decoding_state = STATE_RAW;
843 tok->read_coding_spec = 1;
844 tok->enc = NULL;
845 tok->str = str;
846 tok->encoding = (char *)PyMem_MALLOC(6);
847 if (!tok->encoding) {
848 PyTokenizer_Free(tok);
849 return NULL;
850 }
851 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000852
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 /* XXX: constify members. */
854 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
855 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000856}
857
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000858/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859
860struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300861PyTokenizer_FromFile(FILE *fp, const char* enc,
862 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 struct tok_state *tok = tok_new();
865 if (tok == NULL)
866 return NULL;
867 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
868 PyTokenizer_Free(tok);
869 return NULL;
870 }
871 tok->cur = tok->inp = tok->buf;
872 tok->end = tok->buf + BUFSIZ;
873 tok->fp = fp;
874 tok->prompt = ps1;
875 tok->nextprompt = ps2;
876 if (enc != NULL) {
877 /* Must copy encoding declaration since it
878 gets copied into the parse tree. */
879 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
880 if (!tok->encoding) {
881 PyTokenizer_Free(tok);
882 return NULL;
883 }
884 strcpy(tok->encoding, enc);
885 tok->decoding_state = STATE_NORMAL;
886 }
887 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000888}
889
890
891/* Free a tok_state structure */
892
893void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000894PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000895{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 if (tok->encoding != NULL)
897 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000898#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 Py_XDECREF(tok->decoding_readline);
900 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200901 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000902#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903 if (tok->fp != NULL && tok->buf != NULL)
904 PyMem_FREE(tok->buf);
905 if (tok->input)
906 PyMem_FREE((char *)tok->input);
907 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908}
909
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910/* Get next char, updating state; error code goes into tok->done */
911
912static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200913tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 for (;;) {
916 if (tok->cur != tok->inp) {
917 return Py_CHARMASK(*tok->cur++); /* Fast path */
918 }
919 if (tok->done != E_OK)
920 return EOF;
921 if (tok->fp == NULL) {
922 char *end = strchr(tok->inp, '\n');
923 if (end != NULL)
924 end++;
925 else {
926 end = strchr(tok->inp, '\0');
927 if (end == tok->inp) {
928 tok->done = E_EOF;
929 return EOF;
930 }
931 }
932 if (tok->start == NULL)
933 tok->buf = tok->cur;
934 tok->line_start = tok->cur;
935 tok->lineno++;
936 tok->inp = end;
937 return Py_CHARMASK(*tok->cur++);
938 }
939 if (tok->prompt != NULL) {
940 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000941#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000942 if (newtok != NULL) {
943 char *translated = translate_newlines(newtok, 0, tok);
944 PyMem_FREE(newtok);
945 if (translated == NULL)
946 return EOF;
947 newtok = translated;
948 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000949 if (tok->encoding && newtok && *newtok) {
950 /* Recode to UTF-8 */
951 Py_ssize_t buflen;
952 const char* buf;
953 PyObject *u = translate_into_utf8(newtok, tok->encoding);
954 PyMem_FREE(newtok);
955 if (!u) {
956 tok->done = E_DECODE;
957 return EOF;
958 }
959 buflen = PyBytes_GET_SIZE(u);
960 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000961 newtok = PyMem_MALLOC(buflen+1);
962 strcpy(newtok, buf);
963 Py_DECREF(u);
964 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000965#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 if (tok->nextprompt != NULL)
967 tok->prompt = tok->nextprompt;
968 if (newtok == NULL)
969 tok->done = E_INTR;
970 else if (*newtok == '\0') {
971 PyMem_FREE(newtok);
972 tok->done = E_EOF;
973 }
974 else if (tok->start != NULL) {
975 size_t start = tok->start - tok->buf;
976 size_t oldlen = tok->cur - tok->buf;
977 size_t newlen = oldlen + strlen(newtok);
978 char *buf = tok->buf;
979 buf = (char *)PyMem_REALLOC(buf, newlen+1);
980 tok->lineno++;
981 if (buf == NULL) {
982 PyMem_FREE(tok->buf);
983 tok->buf = NULL;
984 PyMem_FREE(newtok);
985 tok->done = E_NOMEM;
986 return EOF;
987 }
988 tok->buf = buf;
989 tok->cur = tok->buf + oldlen;
990 tok->line_start = tok->cur;
991 strcpy(tok->buf + oldlen, newtok);
992 PyMem_FREE(newtok);
993 tok->inp = tok->buf + newlen;
994 tok->end = tok->inp + 1;
995 tok->start = tok->buf + start;
996 }
997 else {
998 tok->lineno++;
999 if (tok->buf != NULL)
1000 PyMem_FREE(tok->buf);
1001 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001002 tok->cur = tok->buf;
1003 tok->line_start = tok->buf;
1004 tok->inp = strchr(tok->buf, '\0');
1005 tok->end = tok->inp + 1;
1006 }
1007 }
1008 else {
1009 int done = 0;
1010 Py_ssize_t cur = 0;
1011 char *pt;
1012 if (tok->start == NULL) {
1013 if (tok->buf == NULL) {
1014 tok->buf = (char *)
1015 PyMem_MALLOC(BUFSIZ);
1016 if (tok->buf == NULL) {
1017 tok->done = E_NOMEM;
1018 return EOF;
1019 }
1020 tok->end = tok->buf + BUFSIZ;
1021 }
1022 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1023 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001024 if (!tok->decoding_erred)
1025 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001026 done = 1;
1027 }
1028 else {
1029 tok->done = E_OK;
1030 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -07001031 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 }
1033 }
1034 else {
1035 cur = tok->cur - tok->buf;
1036 if (decoding_feof(tok)) {
1037 tok->done = E_EOF;
1038 done = 1;
1039 }
1040 else
1041 tok->done = E_OK;
1042 }
1043 tok->lineno++;
1044 /* Read until '\n' or EOF */
1045 while (!done) {
1046 Py_ssize_t curstart = tok->start == NULL ? -1 :
1047 tok->start - tok->buf;
1048 Py_ssize_t curvalid = tok->inp - tok->buf;
1049 Py_ssize_t newsize = curvalid + BUFSIZ;
1050 char *newbuf = tok->buf;
1051 newbuf = (char *)PyMem_REALLOC(newbuf,
1052 newsize);
1053 if (newbuf == NULL) {
1054 tok->done = E_NOMEM;
1055 tok->cur = tok->inp;
1056 return EOF;
1057 }
1058 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001059 tok->cur = tok->buf + cur;
1060 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 tok->inp = tok->buf + curvalid;
1062 tok->end = tok->buf + newsize;
1063 tok->start = curstart < 0 ? NULL :
1064 tok->buf + curstart;
1065 if (decoding_fgets(tok->inp,
1066 (int)(tok->end - tok->inp),
1067 tok) == NULL) {
1068 /* Break out early on decoding
1069 errors, as tok->buf will be NULL
1070 */
1071 if (tok->decoding_erred)
1072 return EOF;
1073 /* Last line does not end in \n,
1074 fake one */
1075 strcpy(tok->inp, "\n");
1076 }
1077 tok->inp = strchr(tok->inp, '\0');
1078 done = tok->inp[-1] == '\n';
1079 }
1080 if (tok->buf != NULL) {
1081 tok->cur = tok->buf + cur;
1082 tok->line_start = tok->cur;
1083 /* replace "\r\n" with "\n" */
1084 /* For Mac leave the \r, giving a syntax error */
1085 pt = tok->inp - 2;
1086 if (pt >= tok->buf && *pt == '\r') {
1087 *pt++ = '\n';
1088 *pt = '\0';
1089 tok->inp = pt;
1090 }
1091 }
1092 }
1093 if (tok->done != E_OK) {
1094 if (tok->prompt != NULL)
1095 PySys_WriteStderr("\n");
1096 tok->cur = tok->inp;
1097 return EOF;
1098 }
1099 }
1100 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101}
1102
1103
1104/* Back-up one character */
1105
1106static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001107tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 if (c != EOF) {
1110 if (--tok->cur < tok->buf)
1111 Py_FatalError("tok_backup: beginning of buffer");
1112 if (*tok->cur != c)
1113 *tok->cur = c;
1114 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001115}
1116
1117
1118/* Return the token corresponding to a single character */
1119
1120int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001121PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 switch (c) {
1124 case '(': return LPAR;
1125 case ')': return RPAR;
1126 case '[': return LSQB;
1127 case ']': return RSQB;
1128 case ':': return COLON;
1129 case ',': return COMMA;
1130 case ';': return SEMI;
1131 case '+': return PLUS;
1132 case '-': return MINUS;
1133 case '*': return STAR;
1134 case '/': return SLASH;
1135 case '|': return VBAR;
1136 case '&': return AMPER;
1137 case '<': return LESS;
1138 case '>': return GREATER;
1139 case '=': return EQUAL;
1140 case '.': return DOT;
1141 case '%': return PERCENT;
1142 case '{': return LBRACE;
1143 case '}': return RBRACE;
1144 case '^': return CIRCUMFLEX;
1145 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001146 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 default: return OP;
1148 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149}
1150
1151
Guido van Rossumfbab9051991-10-20 20:25:03 +00001152int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001153PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001154{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 switch (c1) {
1156 case '=':
1157 switch (c2) {
1158 case '=': return EQEQUAL;
1159 }
1160 break;
1161 case '!':
1162 switch (c2) {
1163 case '=': return NOTEQUAL;
1164 }
1165 break;
1166 case '<':
1167 switch (c2) {
1168 case '>': return NOTEQUAL;
1169 case '=': return LESSEQUAL;
1170 case '<': return LEFTSHIFT;
1171 }
1172 break;
1173 case '>':
1174 switch (c2) {
1175 case '=': return GREATEREQUAL;
1176 case '>': return RIGHTSHIFT;
1177 }
1178 break;
1179 case '+':
1180 switch (c2) {
1181 case '=': return PLUSEQUAL;
1182 }
1183 break;
1184 case '-':
1185 switch (c2) {
1186 case '=': return MINEQUAL;
1187 case '>': return RARROW;
1188 }
1189 break;
1190 case '*':
1191 switch (c2) {
1192 case '*': return DOUBLESTAR;
1193 case '=': return STAREQUAL;
1194 }
1195 break;
1196 case '/':
1197 switch (c2) {
1198 case '/': return DOUBLESLASH;
1199 case '=': return SLASHEQUAL;
1200 }
1201 break;
1202 case '|':
1203 switch (c2) {
1204 case '=': return VBAREQUAL;
1205 }
1206 break;
1207 case '%':
1208 switch (c2) {
1209 case '=': return PERCENTEQUAL;
1210 }
1211 break;
1212 case '&':
1213 switch (c2) {
1214 case '=': return AMPEREQUAL;
1215 }
1216 break;
1217 case '^':
1218 switch (c2) {
1219 case '=': return CIRCUMFLEXEQUAL;
1220 }
1221 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001222 case '@':
1223 switch (c2) {
1224 case '=': return ATEQUAL;
1225 }
1226 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 }
1228 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001229}
1230
Thomas Wouters434d0822000-08-24 20:11:32 +00001231int
1232PyToken_ThreeChars(int c1, int c2, int c3)
1233{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 switch (c1) {
1235 case '<':
1236 switch (c2) {
1237 case '<':
1238 switch (c3) {
1239 case '=':
1240 return LEFTSHIFTEQUAL;
1241 }
1242 break;
1243 }
1244 break;
1245 case '>':
1246 switch (c2) {
1247 case '>':
1248 switch (c3) {
1249 case '=':
1250 return RIGHTSHIFTEQUAL;
1251 }
1252 break;
1253 }
1254 break;
1255 case '*':
1256 switch (c2) {
1257 case '*':
1258 switch (c3) {
1259 case '=':
1260 return DOUBLESTAREQUAL;
1261 }
1262 break;
1263 }
1264 break;
1265 case '/':
1266 switch (c2) {
1267 case '/':
1268 switch (c3) {
1269 case '=':
1270 return DOUBLESLASHEQUAL;
1271 }
1272 break;
1273 }
1274 break;
1275 case '.':
1276 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001277 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 switch (c3) {
1279 case '.':
1280 return ELLIPSIS;
1281 }
1282 break;
1283 }
1284 break;
1285 }
1286 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001287}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001288
Guido van Rossum926f13a1998-04-09 21:38:06 +00001289static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001290indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001291{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 if (tok->alterror) {
1293 tok->done = E_TABSPACE;
1294 tok->cur = tok->inp;
1295 return 1;
1296 }
1297 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001298#ifdef PGEN
1299 PySys_WriteStderr("inconsistent use of tabs and spaces "
1300 "in indentation\n");
1301#else
1302 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001304#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 tok->altwarning = 0;
1306 }
1307 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001308}
1309
Martin v. Löwis47383402007-08-15 07:32:56 +00001310#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001311#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001312#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313/* Verify that the identifier follows PEP 3131.
1314 All identifier strings are guaranteed to be "ready" unicode objects.
1315 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001316static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001317verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001318{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 PyObject *s;
1320 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001321 if (tok->decoding_erred)
1322 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1326 PyErr_Clear();
1327 tok->done = E_IDENTIFIER;
1328 } else {
1329 tok->done = E_ERROR;
1330 }
1331 return 0;
1332 }
1333 result = PyUnicode_IsIdentifier(s);
1334 Py_DECREF(s);
1335 if (result == 0)
1336 tok->done = E_IDENTIFIER;
1337 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001338}
1339#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001340
Brett Cannona721aba2016-09-09 14:57:09 -07001341static int
1342tok_decimal_tail(struct tok_state *tok)
1343{
1344 int c;
1345
1346 while (1) {
1347 do {
1348 c = tok_nextc(tok);
1349 } while (isdigit(c));
1350 if (c != '_') {
1351 break;
1352 }
1353 c = tok_nextc(tok);
1354 if (!isdigit(c)) {
1355 tok->done = E_TOKEN;
1356 tok_backup(tok, c);
1357 return 0;
1358 }
1359 }
1360 return c;
1361}
1362
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363/* Get next token, after space stripping etc. */
1364
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001365static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001366tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001368 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001370
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001372 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 tok->start = NULL;
1374 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001375
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 /* Get indentation level */
1377 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001378 int col = 0;
1379 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 tok->atbol = 0;
1381 for (;;) {
1382 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001383 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001385 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386 else if (c == '\t') {
1387 col = (col/tok->tabsize + 1) * tok->tabsize;
1388 altcol = (altcol/tok->alttabsize + 1)
1389 * tok->alttabsize;
1390 }
Brett Cannona721aba2016-09-09 14:57:09 -07001391 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001393 }
1394 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001396 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 }
1398 tok_backup(tok, c);
1399 if (c == '#' || c == '\n') {
1400 /* Lines with only whitespace and/or comments
1401 shouldn't affect the indentation and are
1402 not passed to the parser as NEWLINE tokens,
1403 except *totally* empty lines in interactive
1404 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001405 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001407 }
1408 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001410 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 /* We can't jump back right here since we still
1412 may need to skip to the end of a comment */
1413 }
1414 if (!blankline && tok->level == 0) {
1415 if (col == tok->indstack[tok->indent]) {
1416 /* No change */
1417 if (altcol != tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001418 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001420 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001421 }
1422 }
1423 else if (col > tok->indstack[tok->indent]) {
1424 /* Indent -- always one */
1425 if (tok->indent+1 >= MAXINDENT) {
1426 tok->done = E_TOODEEP;
1427 tok->cur = tok->inp;
1428 return ERRORTOKEN;
1429 }
1430 if (altcol <= tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001431 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001433 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001434 }
1435 tok->pendin++;
1436 tok->indstack[++tok->indent] = col;
1437 tok->altindstack[tok->indent] = altcol;
1438 }
1439 else /* col < tok->indstack[tok->indent] */ {
1440 /* Dedent -- any number, must be consistent */
1441 while (tok->indent > 0 &&
1442 col < tok->indstack[tok->indent]) {
1443 tok->pendin--;
1444 tok->indent--;
1445 }
1446 if (col != tok->indstack[tok->indent]) {
1447 tok->done = E_DEDENT;
1448 tok->cur = tok->inp;
1449 return ERRORTOKEN;
1450 }
1451 if (altcol != tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001452 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001454 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001455 }
1456 }
1457 }
1458 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001461
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 /* Return pending indents/dedents */
1463 if (tok->pendin != 0) {
1464 if (tok->pendin < 0) {
1465 tok->pendin++;
1466 return DEDENT;
1467 }
1468 else {
1469 tok->pendin--;
1470 return INDENT;
1471 }
1472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
Yury Selivanov96ec9342015-07-23 15:01:58 +03001474 if (tok->async_def
1475 && !blankline
1476 && tok->level == 0
1477 /* There was a NEWLINE after ASYNC DEF,
1478 so we're past the signature. */
1479 && tok->async_def_nl
1480 /* Current indentation level is less than where
1481 the async function was defined */
1482 && tok->async_def_indent >= tok->indent)
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001483 {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001484 tok->async_def = 0;
1485 tok->async_def_indent = 0;
1486 tok->async_def_nl = 0;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001487 }
1488
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001489 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 tok->start = NULL;
1491 /* Skip spaces */
1492 do {
1493 c = tok_nextc(tok);
1494 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001495
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 /* Set start of current token */
1497 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 /* Skip comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001500 if (c == '#') {
1501 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001502 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001503 }
1504 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001505
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 /* Check for EOF and errors now */
1507 if (c == EOF) {
1508 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1509 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001510
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001511 /* Identifier (most frequent token!) */
1512 nonascii = 0;
1513 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001514 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001515 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001516 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001517 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001518 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001519 /* Since this is a backwards compatibility support literal we don't
1520 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001521 else if (!(saw_b || saw_u || saw_r || saw_f)
1522 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001523 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001524 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001525 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001526 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001527 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001528 }
1529 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001530 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001531 }
1532 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001533 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001534 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001535 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001536 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001538 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 }
1540 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001541 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001543 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001544 c = tok_nextc(tok);
1545 }
1546 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001547 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001548 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001549 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001550 *p_start = tok->start;
1551 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001552
Yury Selivanov96ec9342015-07-23 15:01:58 +03001553 /* async/await parsing block. */
1554 if (tok->cur - tok->start == 5) {
1555 /* Current token length is 5. */
1556 if (tok->async_def) {
1557 /* We're inside an 'async def' function. */
Brett Cannona721aba2016-09-09 14:57:09 -07001558 if (memcmp(tok->start, "async", 5) == 0) {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001559 return ASYNC;
Brett Cannona721aba2016-09-09 14:57:09 -07001560 }
1561 if (memcmp(tok->start, "await", 5) == 0) {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001562 return AWAIT;
Brett Cannona721aba2016-09-09 14:57:09 -07001563 }
Yury Selivanov75445082015-05-11 22:57:16 -04001564 }
Yury Selivanov96ec9342015-07-23 15:01:58 +03001565 else if (memcmp(tok->start, "async", 5) == 0) {
1566 /* The current token is 'async'.
1567 Look ahead one token.*/
Yury Selivanov8085b802015-05-18 12:50:52 -04001568
Yury Selivanov96ec9342015-07-23 15:01:58 +03001569 struct tok_state ahead_tok;
1570 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1571 int ahead_tok_kind;
Yury Selivanov8085b802015-05-18 12:50:52 -04001572
Yury Selivanov75445082015-05-11 22:57:16 -04001573 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
Yury Selivanov75445082015-05-11 22:57:16 -04001574 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
Yury Selivanov96ec9342015-07-23 15:01:58 +03001575 &ahead_tok_end);
Yury Selivanov75445082015-05-11 22:57:16 -04001576
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001577 if (ahead_tok_kind == NAME
1578 && ahead_tok.cur - ahead_tok.start == 3
1579 && memcmp(ahead_tok.start, "def", 3) == 0)
1580 {
1581 /* The next token is going to be 'def', so instead of
1582 returning 'async' NAME token, we return ASYNC. */
Yury Selivanov96ec9342015-07-23 15:01:58 +03001583 tok->async_def_indent = tok->indent;
1584 tok->async_def = 1;
Yury Selivanov75445082015-05-11 22:57:16 -04001585 return ASYNC;
1586 }
Yury Selivanov75445082015-05-11 22:57:16 -04001587 }
1588 }
1589
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 return NAME;
1591 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001592
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 /* Newline */
1594 if (c == '\n') {
1595 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001596 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001597 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001598 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001599 *p_start = tok->start;
1600 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1601 tok->cont_line = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +03001602 if (tok->async_def) {
1603 /* We're somewhere inside an 'async def' function, and
1604 we've encountered a NEWLINE after its signature. */
1605 tok->async_def_nl = 1;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001606 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 return NEWLINE;
1608 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 /* Period or number starting with period? */
1611 if (c == '.') {
1612 c = tok_nextc(tok);
1613 if (isdigit(c)) {
1614 goto fraction;
1615 } else if (c == '.') {
1616 c = tok_nextc(tok);
1617 if (c == '.') {
1618 *p_start = tok->start;
1619 *p_end = tok->cur;
1620 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001621 }
1622 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 tok_backup(tok, c);
1624 }
1625 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001626 }
1627 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 tok_backup(tok, c);
1629 }
1630 *p_start = tok->start;
1631 *p_end = tok->cur;
1632 return DOT;
1633 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001634
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 /* Number */
1636 if (isdigit(c)) {
1637 if (c == '0') {
1638 /* Hex, octal or binary -- maybe. */
1639 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001640 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 /* Hex */
1642 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001644 if (c == '_') {
1645 c = tok_nextc(tok);
1646 }
1647 if (!isxdigit(c)) {
1648 tok->done = E_TOKEN;
1649 tok_backup(tok, c);
1650 return ERRORTOKEN;
1651 }
1652 do {
1653 c = tok_nextc(tok);
1654 } while (isxdigit(c));
1655 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001656 }
1657 else if (c == 'o' || c == 'O') {
1658 /* Octal */
1659 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001661 if (c == '_') {
1662 c = tok_nextc(tok);
1663 }
1664 if (c < '0' || c >= '8') {
1665 tok->done = E_TOKEN;
1666 tok_backup(tok, c);
1667 return ERRORTOKEN;
1668 }
1669 do {
1670 c = tok_nextc(tok);
1671 } while ('0' <= c && c < '8');
1672 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 }
1674 else if (c == 'b' || c == 'B') {
1675 /* Binary */
1676 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001678 if (c == '_') {
1679 c = tok_nextc(tok);
1680 }
1681 if (c != '0' && c != '1') {
1682 tok->done = E_TOKEN;
1683 tok_backup(tok, c);
1684 return ERRORTOKEN;
1685 }
1686 do {
1687 c = tok_nextc(tok);
1688 } while (c == '0' || c == '1');
1689 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690 }
1691 else {
1692 int nonzero = 0;
1693 /* maybe old-style octal; c is first char of it */
1694 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001695 while (1) {
1696 if (c == '_') {
1697 c = tok_nextc(tok);
1698 if (!isdigit(c)) {
1699 tok->done = E_TOKEN;
1700 tok_backup(tok, c);
1701 return ERRORTOKEN;
1702 }
1703 }
1704 if (c != '0') {
1705 break;
1706 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 c = tok_nextc(tok);
1708 }
Brett Cannona721aba2016-09-09 14:57:09 -07001709 if (isdigit(c)) {
1710 nonzero = 1;
1711 c = tok_decimal_tail(tok);
1712 if (c == 0) {
1713 return ERRORTOKEN;
1714 }
1715 }
1716 if (c == '.') {
1717 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001719 }
1720 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001722 }
1723 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001725 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001726 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001727 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 tok->done = E_TOKEN;
1729 tok_backup(tok, c);
1730 return ERRORTOKEN;
1731 }
1732 }
1733 }
1734 else {
1735 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001736 c = tok_decimal_tail(tok);
1737 if (c == 0) {
1738 return ERRORTOKEN;
1739 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 {
1741 /* Accept floating point numbers. */
1742 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001743 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 fraction:
1745 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001746 if (isdigit(c)) {
1747 c = tok_decimal_tail(tok);
1748 if (c == 0) {
1749 return ERRORTOKEN;
1750 }
1751 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 }
1753 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001754 int e;
1755 exponent:
1756 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001757 /* Exponent part */
1758 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001759 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001760 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001761 if (!isdigit(c)) {
1762 tok->done = E_TOKEN;
1763 tok_backup(tok, c);
1764 return ERRORTOKEN;
1765 }
1766 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001767 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001768 tok_backup(tok, e);
1769 *p_start = tok->start;
1770 *p_end = tok->cur;
1771 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 }
Brett Cannona721aba2016-09-09 14:57:09 -07001773 c = tok_decimal_tail(tok);
1774 if (c == 0) {
1775 return ERRORTOKEN;
1776 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 }
Brett Cannona721aba2016-09-09 14:57:09 -07001778 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 /* Imaginary part */
1780 imaginary:
1781 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001782 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 }
1784 }
1785 tok_backup(tok, c);
1786 *p_start = tok->start;
1787 *p_end = tok->cur;
1788 return NUMBER;
1789 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001790
1791 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001792 /* String */
1793 if (c == '\'' || c == '"') {
1794 int quote = c;
1795 int quote_size = 1; /* 1 or 3 */
1796 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001797
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001798 /* Find the quote size and start of string */
1799 c = tok_nextc(tok);
1800 if (c == quote) {
1801 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001802 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001803 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001804 }
1805 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001807 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001808 }
Brett Cannona721aba2016-09-09 14:57:09 -07001809 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001810 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001811 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001812
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001813 /* Get rest of string */
1814 while (end_quote_size != quote_size) {
1815 c = tok_nextc(tok);
1816 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001817 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001818 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001819 }
1820 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001821 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001822 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001823 tok->cur = tok->inp;
1824 return ERRORTOKEN;
1825 }
1826 if (quote_size == 1 && c == '\n') {
1827 tok->done = E_EOLS;
1828 tok->cur = tok->inp;
1829 return ERRORTOKEN;
1830 }
Brett Cannona721aba2016-09-09 14:57:09 -07001831 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001832 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001833 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001834 else {
1835 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001836 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001837 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001838 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 }
1840 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001841
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001842 *p_start = tok->start;
1843 *p_end = tok->cur;
1844 return STRING;
1845 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001846
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001847 /* Line continuation */
1848 if (c == '\\') {
1849 c = tok_nextc(tok);
1850 if (c != '\n') {
1851 tok->done = E_LINECONT;
1852 tok->cur = tok->inp;
1853 return ERRORTOKEN;
1854 }
1855 tok->cont_line = 1;
1856 goto again; /* Read next line */
1857 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001858
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001859 /* Check for two-character token */
1860 {
1861 int c2 = tok_nextc(tok);
1862 int token = PyToken_TwoChars(c, c2);
1863 if (token != OP) {
1864 int c3 = tok_nextc(tok);
1865 int token3 = PyToken_ThreeChars(c, c2, c3);
1866 if (token3 != OP) {
1867 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001868 }
1869 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001870 tok_backup(tok, c3);
1871 }
1872 *p_start = tok->start;
1873 *p_end = tok->cur;
1874 return token;
1875 }
1876 tok_backup(tok, c2);
1877 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001878
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001879 /* Keep track of parentheses nesting level */
1880 switch (c) {
1881 case '(':
1882 case '[':
1883 case '{':
1884 tok->level++;
1885 break;
1886 case ')':
1887 case ']':
1888 case '}':
1889 tok->level--;
1890 break;
1891 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001892
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001893 /* Punctuation character */
1894 *p_start = tok->start;
1895 *p_end = tok->cur;
1896 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001897}
1898
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001899int
1900PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1901{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001902 int result = tok_get(tok, p_start, p_end);
1903 if (tok->decoding_erred) {
1904 result = ERRORTOKEN;
1905 tok->done = E_DECODE;
1906 }
1907 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001908}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001909
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001910/* Get the encoding of a Python file. Check for the coding cookie and check if
1911 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001912
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001913 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1914 encoding in the first or second line of the file (in which case the encoding
1915 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001916
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001917 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1918 by the caller. */
1919
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001920char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001921PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001922{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001923 struct tok_state *tok;
1924 FILE *fp;
1925 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001926
Victor Stinnerdaf45552013-08-28 00:53:59 +02001927#ifndef PGEN
1928 fd = _Py_dup(fd);
1929#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001930 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001931#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001932 if (fd < 0) {
1933 return NULL;
1934 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001935
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001936 fp = fdopen(fd, "r");
1937 if (fp == NULL) {
1938 return NULL;
1939 }
1940 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1941 if (tok == NULL) {
1942 fclose(fp);
1943 return NULL;
1944 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001945#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001946 if (filename != NULL) {
1947 Py_INCREF(filename);
1948 tok->filename = filename;
1949 }
1950 else {
1951 tok->filename = PyUnicode_FromString("<string>");
1952 if (tok->filename == NULL) {
1953 fclose(fp);
1954 PyTokenizer_Free(tok);
1955 return encoding;
1956 }
1957 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001958#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001959 while (tok->lineno < 2 && tok->done == E_OK) {
1960 PyTokenizer_Get(tok, &p_start, &p_end);
1961 }
1962 fclose(fp);
1963 if (tok->encoding) {
1964 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1965 if (encoding)
1966 strcpy(encoding, tok->encoding);
1967 }
1968 PyTokenizer_Free(tok);
1969 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001970}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001971
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001972char *
1973PyTokenizer_FindEncoding(int fd)
1974{
1975 return PyTokenizer_FindEncodingFilename(fd, NULL);
1976}
1977
Guido van Rossum408027e1996-12-30 16:17:54 +00001978#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001979
1980void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001981tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001982{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001983 printf("%s", _PyParser_TokenNames[type]);
1984 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1985 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001986}
1987
1988#endif