blob: ff65f2a735903c2d6d90391ac772b4ce881d60f0 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may start an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   Every use of the argument is parenthesized so that an expression
   argument (e.g. a conditional expression) expands correctly.  The
   argument is still evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.
   Every use of the argument is parenthesized so that an expression
   argument expands correctly.  The argument is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.
   This table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "ATEQUAL", "RARROW", "ELLIPSIS",

    "OP", "AWAIT", "ASYNC",
    "<ERRORTOKEN>", "<N_TOKENS>"
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000116tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119 sizeof(struct tok_state));
120 if (tok == NULL)
121 return NULL;
122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123 tok->done = E_OK;
124 tok->fp = NULL;
125 tok->input = NULL;
126 tok->tabsize = TABSIZE;
127 tok->indent = 0;
128 tok->indstack[0] = 0;
Yury Selivanov75445082015-05-11 22:57:16 -0400129
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
134 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 tok->altwarning = 1;
136 tok->alterror = 1;
137 tok->alttabsize = 1;
138 tok->altindstack[0] = 0;
139 tok->decoding_state = STATE_INIT;
140 tok->decoding_erred = 0;
141 tok->read_coding_spec = 0;
142 tok->enc = NULL;
143 tok->encoding = NULL;
144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200146 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 tok->decoding_readline = NULL;
148 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000149#endif
Yury Selivanov96ec9342015-07-23 15:01:58 +0300150
151 tok->async_def = 0;
152 tok->async_def_indent = 0;
153 tok->async_def_nl = 0;
154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000156}
157
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700159new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700162 if (!result) {
163 tok->done = E_NOMEM;
164 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700166 memcpy(result, s, len);
167 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000169}
170
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000171#ifdef PGEN
172
173static char *
174decoding_fgets(char *s, int size, struct tok_state *tok)
175{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177}
178
179static int
180decoding_feof(struct tok_state *tok)
181{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183}
184
/* PGEN build: no decoding; just return a newly allocated copy of STR
   (NULL on allocation failure, see new_string).  EXEC_INPUT is unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes, tok);
}
190
191#else /* PGEN */
192
193static char *
194error_ret(struct tok_state *tok) /* XXX */
195{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 tok->decoding_erred = 1;
197 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198 PyMem_FREE(tok->buf);
Serhiy Storchaka0d441112015-11-14 15:10:35 +0200199 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200 tok->done = E_DECODE;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202}
203
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000204
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are examined, after lowercasing
   and replacing '_' with '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by "-<suffix>"); otherwise returns S itself, so callers can
   compare the result pointer with S to see whether a mapping happened. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* tolower() is only defined for EOF and unsigned char values;
           a plain char with the high bit set may be negative. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233
234/* Return the coding spec in S, or NULL if none is found. */
235
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700236static int
237get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000238{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700240 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 /* Coding spec must be in a comment, and that comment must be
242 * the only statement on the source code line. */
243 for (i = 0; i < size - 6; i++) {
244 if (s[i] == '#')
245 break;
246 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700247 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 }
249 for (; i < size - 6; i++) { /* XXX inefficient search */
250 const char* t = s + i;
251 if (strncmp(t, "coding", 6) == 0) {
252 const char* begin = NULL;
253 t += 6;
254 if (t[0] != ':' && t[0] != '=')
255 continue;
256 do {
257 t++;
258 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 begin = t;
261 while (Py_ISALNUM(t[0]) ||
262 t[0] == '-' || t[0] == '_' || t[0] == '.')
263 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700266 char* r = new_string(begin, t - begin, tok);
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200267 const char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700268 if (!r)
269 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700270 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 if (r != q) {
272 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700273 r = new_string(q, strlen(q), tok);
274 if (!r)
275 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700277 *spec = r;
Serhiy Storchakae431d3c2016-03-20 23:36:29 +0200278 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 }
280 }
281 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700282 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283}
284
285/* Check whether the line contains a coding spec. If it does,
286 invoke the set_readline function for the new encoding.
287 This function receives the tok_state and the new encoding.
288 Return 1 on success, 0 on failure. */
289
290static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000291check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700294 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000295 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000296
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200297 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200299 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000300 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200301 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700302 if (!get_coding_spec(line, &cs, size, tok))
303 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200304 if (!cs) {
305 Py_ssize_t i;
306 for (i = 0; i < size; i++) {
307 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
308 break;
309 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
310 /* Stop checking coding spec after a line containing
311 * anything except a comment. */
312 tok->read_coding_spec = 1;
313 break;
314 }
315 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 tok->read_coding_spec = 1;
319 if (tok->encoding == NULL) {
320 assert(tok->decoding_state == STATE_RAW);
321 if (strcmp(cs, "utf-8") == 0) {
322 tok->encoding = cs;
323 } else {
324 r = set_readline(tok, cs);
325 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700327 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000328 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700329 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300330 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700331 "encoding problem: %s", cs);
332 PyMem_FREE(cs);
333 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700335 } else { /* then, compare cs with BOM */
336 r = (strcmp(tok->encoding, cs) == 0);
337 if (!r)
338 PyErr_Format(PyExc_SyntaxError,
339 "encoding problem: %s with BOM", cs);
340 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000342 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343}
344
345/* See whether the file starts with a BOM. If it does,
346 invoke the set_readline function with the new encoding.
347 Return 1 on success, 0 on failure. */
348
349static int
350check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000351 void unget_char(int, struct tok_state *),
352 int set_readline(struct tok_state *, const char *),
353 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000354{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000355 int ch1, ch2, ch3;
356 ch1 = get_char(tok);
357 tok->decoding_state = STATE_RAW;
358 if (ch1 == EOF) {
359 return 1;
360 } else if (ch1 == 0xEF) {
361 ch2 = get_char(tok);
362 if (ch2 != 0xBB) {
363 unget_char(ch2, tok);
364 unget_char(ch1, tok);
365 return 1;
366 }
367 ch3 = get_char(tok);
368 if (ch3 != 0xBF) {
369 unget_char(ch3, tok);
370 unget_char(ch2, tok);
371 unget_char(ch1, tok);
372 return 1;
373 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000374#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 /* Disable support for UTF-16 BOMs until a decision
376 is made whether this needs to be supported. */
377 } else if (ch1 == 0xFE) {
378 ch2 = get_char(tok);
379 if (ch2 != 0xFF) {
380 unget_char(ch2, tok);
381 unget_char(ch1, tok);
382 return 1;
383 }
384 if (!set_readline(tok, "utf-16-be"))
385 return 0;
386 tok->decoding_state = STATE_NORMAL;
387 } else if (ch1 == 0xFF) {
388 ch2 = get_char(tok);
389 if (ch2 != 0xFE) {
390 unget_char(ch2, tok);
391 unget_char(ch1, tok);
392 return 1;
393 }
394 if (!set_readline(tok, "utf-16-le"))
395 return 0;
396 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 } else {
399 unget_char(ch1, tok);
400 return 1;
401 }
402 if (tok->encoding != NULL)
403 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700404 tok->encoding = new_string("utf-8", 5, tok);
405 if (!tok->encoding)
406 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 /* No need to set_readline: input is already utf-8 */
408 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000409}
410
411/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000412 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000413
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000414 On entry, tok->decoding_buffer will be one of:
415 1) NULL: need to call tok->decoding_readline to get a new line
416 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000418 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 (in the s buffer) to copy entire contents of the line read
420 by tok->decoding_readline. tok->decoding_buffer has the overflow.
421 In this case, fp_readl is called in a loop (with an expanded buffer)
422 until the buffer ends with a '\n' (or until the end of the file is
423 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000424*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426static char *
427fp_readl(char *s, int size, struct tok_state *tok)
428{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000429 PyObject* bufobj;
430 const char *buf;
431 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000433 /* Ask for one less byte so we can terminate it */
434 assert(size > 0);
435 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000436
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000437 if (tok->decoding_buffer) {
438 bufobj = tok->decoding_buffer;
439 Py_INCREF(bufobj);
440 }
441 else
442 {
443 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
444 if (bufobj == NULL)
445 goto error;
446 }
447 if (PyUnicode_CheckExact(bufobj))
448 {
Serhiy Storchaka06515832016-11-20 09:13:07 +0200449 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000450 if (buf == NULL) {
451 goto error;
452 }
453 }
454 else
455 {
456 buf = PyByteArray_AsString(bufobj);
457 if (buf == NULL) {
458 goto error;
459 }
460 buflen = PyByteArray_GET_SIZE(bufobj);
461 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000462
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 Py_XDECREF(tok->decoding_buffer);
464 if (buflen > size) {
465 /* Too many chars, the rest goes into tok->decoding_buffer */
466 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
467 buflen-size);
468 if (tok->decoding_buffer == NULL)
469 goto error;
470 buflen = size;
471 }
472 else
473 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000474
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 memcpy(s, buf, buflen);
476 s[buflen] = '\0';
477 if (buflen == 0) /* EOF */
478 s = NULL;
479 Py_DECREF(bufobj);
480 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000481
482error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000483 Py_XDECREF(bufobj);
484 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485}
486
487/* Set the readline function for TOK to a StreamReader's
488 readline function. The StreamReader is named ENC.
489
490 This function is called from check_bom and check_coding_spec.
491
492 ENC is usually identical to the future value of tok->encoding,
493 except for the (currently unsupported) case of UTF-16.
494
495 Return 1 on success, 0 on failure. */
496
497static int
498fp_setreadl(struct tok_state *tok, const char* enc)
499{
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700500 PyObject *readline, *io, *stream;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200501 _Py_IDENTIFIER(open);
502 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000503 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200504 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505
Victor Stinner22a351a2010-10-14 12:04:34 +0000506 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200507 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100508 * position of tok->fp. If tok->fp was opened in text mode on Windows,
509 * its file position counts CRLF as one char and can't be directly mapped
510 * to the file offset for fd. Instead we step back one byte and read to
511 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200512 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100513 if (pos == -1 ||
514 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000515 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700516 return 0;
Victor Stinner22a351a2010-10-14 12:04:34 +0000517 }
518
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700519 io = PyImport_ImportModuleNoBlock("io");
520 if (io == NULL)
521 return 0;
522
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200523 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000524 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700525 Py_DECREF(io);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000526 if (stream == NULL)
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700527 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200529 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700530 Py_DECREF(stream);
531 if (readline == NULL)
532 return 0;
Serhiy Storchaka48842712016-04-06 09:45:48 +0300533 Py_XSETREF(tok->decoding_readline, readline);
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700534
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100535 if (pos > 0) {
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700536 PyObject *bufobj = PyObject_CallObject(readline, NULL);
537 if (bufobj == NULL)
538 return 0;
539 Py_DECREF(bufobj);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100540 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000541
Benjamin Peterson35ee9482016-09-12 22:06:58 -0700542 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
545/* Fetch the next byte from TOK. */
546
547static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000548 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549}
550
551/* Unfetch the last byte back into TOK. */
552
553static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000554 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555}
556
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are checked front to
   back, so a sequence cut short by the terminating NUL is rejected at
   the NUL and nothing beyond the terminator is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return trailing + 1;
}
584
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585/* Read a line of input from TOK. Determine encoding
586 if necessary. */
587
588static char *
589decoding_fgets(char *s, int size, struct tok_state *tok)
590{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000591 char *line = NULL;
592 int badchar = 0;
593 for (;;) {
594 if (tok->decoding_state == STATE_NORMAL) {
595 /* We already have a codec associated with
596 this input. */
597 line = fp_readl(s, size, tok);
598 break;
599 } else if (tok->decoding_state == STATE_RAW) {
600 /* We want a 'raw' read. */
601 line = Py_UniversalNewlineFgets(s, size,
602 tok->fp, NULL);
603 break;
604 } else {
605 /* We have not yet determined the encoding.
606 If an encoding is found, use the file-pointer
607 reader functions from now on. */
608 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
609 return error_ret(tok);
610 assert(tok->decoding_state != STATE_INIT);
611 }
612 }
613 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
614 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
615 return error_ret(tok);
616 }
617 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000619 /* The default encoding is UTF-8, so make sure we don't have any
620 non-UTF-8 sequences in it. */
621 if (line && !tok->encoding) {
622 unsigned char *c;
623 int length;
624 for (c = (unsigned char *)line; *c; c += length)
625 if (!(length = valid_utf8(c))) {
626 badchar = *c;
627 break;
628 }
629 }
630 if (badchar) {
631 /* Need to add 1 to the line number, since this line
632 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200633 PyErr_Format(PyExc_SyntaxError,
634 "Non-UTF-8 code starting with '\\x%.2x' "
635 "in file %U on line %i, "
636 "but no encoding declared; "
637 "see http://python.org/dev/peps/pep-0263/ for details",
638 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 return error_ret(tok);
640 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643}
644
645static int
646decoding_feof(struct tok_state *tok)
647{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 if (tok->decoding_state != STATE_NORMAL) {
649 return feof(tok->fp);
650 } else {
651 PyObject* buf = tok->decoding_buffer;
652 if (buf == NULL) {
653 buf = PyObject_CallObject(tok->decoding_readline, NULL);
654 if (buf == NULL) {
655 error_ret(tok);
656 return 1;
657 } else {
658 tok->decoding_buffer = buf;
659 }
660 }
661 return PyObject_Length(buf) == 0;
662 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663}
664
665/* Fetch a byte from TOK, using the string buffer. */
666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000667static int
668buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670}
671
672/* Unfetch a byte from TOK, using the string buffer. */
673
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000674static void
675buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 tok->str--;
677 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000678}
679
680/* Set the readline function for TOK to ENC. For the string-based
681 tokenizer, this means to just record the encoding. */
682
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000683static int
684buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000685 tok->enc = enc;
686 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000687}
688
689/* Return a UTF-8 encoding Python string object from the
690 C byte string STR, which is encoded with ENC. */
691
692static PyObject *
693translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 PyObject *utf8;
695 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
696 if (buf == NULL)
697 return NULL;
698 utf8 = PyUnicode_AsUTF8String(buf);
699 Py_DECREF(buf);
700 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000701}
702
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000703
704static char *
705translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200706 int skip_next_lf = 0;
707 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 char *buf, *current;
709 char c = '\0';
710 buf = PyMem_MALLOC(needed_length);
711 if (buf == NULL) {
712 tok->done = E_NOMEM;
713 return NULL;
714 }
715 for (current = buf; *s; s++, current++) {
716 c = *s;
717 if (skip_next_lf) {
718 skip_next_lf = 0;
719 if (c == '\n') {
720 c = *++s;
721 if (!c)
722 break;
723 }
724 }
725 if (c == '\r') {
726 skip_next_lf = 1;
727 c = '\n';
728 }
729 *current = c;
730 }
731 /* If this is exec input, add a newline to the end of the string if
732 there isn't one already. */
733 if (exec_input && c != '\n') {
734 *current = '\n';
735 current++;
736 }
737 *current = '\0';
738 final_length = current - buf + 1;
739 if (final_length < needed_length && final_length)
740 /* should never fail */
741 buf = PyMem_REALLOC(buf, final_length);
742 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000743}
744
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000745/* Decode a byte string STR for use as the buffer of TOK.
746 Look for encoding declarations inside STR, and record them
747 inside TOK. */
748
749static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000750decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000751{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 PyObject* utf8 = NULL;
753 const char *str;
754 const char *s;
755 const char *newl[2] = {NULL, NULL};
756 int lineno = 0;
757 tok->input = str = translate_newlines(input, single, tok);
758 if (str == NULL)
759 return NULL;
760 tok->enc = NULL;
761 tok->str = str;
762 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
763 return error_ret(tok);
764 str = tok->str; /* string after BOM if any */
765 assert(str);
766 if (tok->enc != NULL) {
767 utf8 = translate_into_utf8(str, tok->enc);
768 if (utf8 == NULL)
769 return error_ret(tok);
770 str = PyBytes_AsString(utf8);
771 }
772 for (s = str;; s++) {
773 if (*s == '\0') break;
774 else if (*s == '\n') {
775 assert(lineno < 2);
776 newl[lineno] = s;
777 lineno++;
778 if (lineno == 2) break;
779 }
780 }
781 tok->enc = NULL;
782 /* need to check line 1 and 2 separately since check_coding_spec
783 assumes a single line as input */
784 if (newl[0]) {
785 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
786 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200787 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
789 tok, buf_setreadl))
790 return error_ret(tok);
791 }
792 }
793 if (tok->enc != NULL) {
794 assert(utf8 == NULL);
795 utf8 = translate_into_utf8(str, tok->enc);
796 if (utf8 == NULL)
797 return error_ret(tok);
798 str = PyBytes_AS_STRING(utf8);
799 }
800 assert(tok->decoding_buffer == NULL);
801 tok->decoding_buffer = utf8; /* CAUTION */
802 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000803}
804
805#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000806
807/* Set up tokenizer for string */
808
809struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000810PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 struct tok_state *tok = tok_new();
813 if (tok == NULL)
814 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300815 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 if (str == NULL) {
817 PyTokenizer_Free(tok);
818 return NULL;
819 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000820
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 /* XXX: constify members. */
822 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
823 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824}
825
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000826struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000827PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000828{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 struct tok_state *tok = tok_new();
830 if (tok == NULL)
831 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000832#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000834#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 if (str == NULL) {
836 PyTokenizer_Free(tok);
837 return NULL;
838 }
839 tok->decoding_state = STATE_RAW;
840 tok->read_coding_spec = 1;
841 tok->enc = NULL;
842 tok->str = str;
843 tok->encoding = (char *)PyMem_MALLOC(6);
844 if (!tok->encoding) {
845 PyTokenizer_Free(tok);
846 return NULL;
847 }
848 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000849
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 /* XXX: constify members. */
851 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
852 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000853}
854
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000855/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856
857struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300858PyTokenizer_FromFile(FILE *fp, const char* enc,
859 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 struct tok_state *tok = tok_new();
862 if (tok == NULL)
863 return NULL;
864 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
865 PyTokenizer_Free(tok);
866 return NULL;
867 }
868 tok->cur = tok->inp = tok->buf;
869 tok->end = tok->buf + BUFSIZ;
870 tok->fp = fp;
871 tok->prompt = ps1;
872 tok->nextprompt = ps2;
873 if (enc != NULL) {
874 /* Must copy encoding declaration since it
875 gets copied into the parse tree. */
876 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
877 if (!tok->encoding) {
878 PyTokenizer_Free(tok);
879 return NULL;
880 }
881 strcpy(tok->encoding, enc);
882 tok->decoding_state = STATE_NORMAL;
883 }
884 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000885}
886
887
888/* Free a tok_state structure */
889
890void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000891PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000892{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000893 if (tok->encoding != NULL)
894 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000895#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 Py_XDECREF(tok->decoding_readline);
897 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200898 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000899#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 if (tok->fp != NULL && tok->buf != NULL)
901 PyMem_FREE(tok->buf);
902 if (tok->input)
903 PyMem_FREE((char *)tok->input);
904 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905}
906
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907/* Get next char, updating state; error code goes into tok->done */
908
909static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200910tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000911{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000912 for (;;) {
913 if (tok->cur != tok->inp) {
914 return Py_CHARMASK(*tok->cur++); /* Fast path */
915 }
916 if (tok->done != E_OK)
917 return EOF;
918 if (tok->fp == NULL) {
919 char *end = strchr(tok->inp, '\n');
920 if (end != NULL)
921 end++;
922 else {
923 end = strchr(tok->inp, '\0');
924 if (end == tok->inp) {
925 tok->done = E_EOF;
926 return EOF;
927 }
928 }
929 if (tok->start == NULL)
930 tok->buf = tok->cur;
931 tok->line_start = tok->cur;
932 tok->lineno++;
933 tok->inp = end;
934 return Py_CHARMASK(*tok->cur++);
935 }
936 if (tok->prompt != NULL) {
937 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000938#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000939 if (newtok != NULL) {
940 char *translated = translate_newlines(newtok, 0, tok);
941 PyMem_FREE(newtok);
942 if (translated == NULL)
943 return EOF;
944 newtok = translated;
945 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 if (tok->encoding && newtok && *newtok) {
947 /* Recode to UTF-8 */
948 Py_ssize_t buflen;
949 const char* buf;
950 PyObject *u = translate_into_utf8(newtok, tok->encoding);
951 PyMem_FREE(newtok);
952 if (!u) {
953 tok->done = E_DECODE;
954 return EOF;
955 }
956 buflen = PyBytes_GET_SIZE(u);
957 buf = PyBytes_AS_STRING(u);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000958 newtok = PyMem_MALLOC(buflen+1);
959 strcpy(newtok, buf);
960 Py_DECREF(u);
961 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000962#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 if (tok->nextprompt != NULL)
964 tok->prompt = tok->nextprompt;
965 if (newtok == NULL)
966 tok->done = E_INTR;
967 else if (*newtok == '\0') {
968 PyMem_FREE(newtok);
969 tok->done = E_EOF;
970 }
971 else if (tok->start != NULL) {
972 size_t start = tok->start - tok->buf;
973 size_t oldlen = tok->cur - tok->buf;
974 size_t newlen = oldlen + strlen(newtok);
975 char *buf = tok->buf;
976 buf = (char *)PyMem_REALLOC(buf, newlen+1);
977 tok->lineno++;
978 if (buf == NULL) {
979 PyMem_FREE(tok->buf);
980 tok->buf = NULL;
981 PyMem_FREE(newtok);
982 tok->done = E_NOMEM;
983 return EOF;
984 }
985 tok->buf = buf;
986 tok->cur = tok->buf + oldlen;
987 tok->line_start = tok->cur;
988 strcpy(tok->buf + oldlen, newtok);
989 PyMem_FREE(newtok);
990 tok->inp = tok->buf + newlen;
991 tok->end = tok->inp + 1;
992 tok->start = tok->buf + start;
993 }
994 else {
995 tok->lineno++;
996 if (tok->buf != NULL)
997 PyMem_FREE(tok->buf);
998 tok->buf = newtok;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 tok->cur = tok->buf;
1000 tok->line_start = tok->buf;
1001 tok->inp = strchr(tok->buf, '\0');
1002 tok->end = tok->inp + 1;
1003 }
1004 }
1005 else {
1006 int done = 0;
1007 Py_ssize_t cur = 0;
1008 char *pt;
1009 if (tok->start == NULL) {
1010 if (tok->buf == NULL) {
1011 tok->buf = (char *)
1012 PyMem_MALLOC(BUFSIZ);
1013 if (tok->buf == NULL) {
1014 tok->done = E_NOMEM;
1015 return EOF;
1016 }
1017 tok->end = tok->buf + BUFSIZ;
1018 }
1019 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1020 tok) == NULL) {
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001021 if (!tok->decoding_erred)
1022 tok->done = E_EOF;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 done = 1;
1024 }
1025 else {
1026 tok->done = E_OK;
1027 tok->inp = strchr(tok->buf, '\0');
Benjamin Peterson26d998c2016-09-18 23:41:11 -07001028 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 }
1030 }
1031 else {
1032 cur = tok->cur - tok->buf;
1033 if (decoding_feof(tok)) {
1034 tok->done = E_EOF;
1035 done = 1;
1036 }
1037 else
1038 tok->done = E_OK;
1039 }
1040 tok->lineno++;
1041 /* Read until '\n' or EOF */
1042 while (!done) {
1043 Py_ssize_t curstart = tok->start == NULL ? -1 :
1044 tok->start - tok->buf;
1045 Py_ssize_t curvalid = tok->inp - tok->buf;
1046 Py_ssize_t newsize = curvalid + BUFSIZ;
1047 char *newbuf = tok->buf;
1048 newbuf = (char *)PyMem_REALLOC(newbuf,
1049 newsize);
1050 if (newbuf == NULL) {
1051 tok->done = E_NOMEM;
1052 tok->cur = tok->inp;
1053 return EOF;
1054 }
1055 tok->buf = newbuf;
Serhiy Storchaka0d441112015-11-14 15:10:35 +02001056 tok->cur = tok->buf + cur;
1057 tok->line_start = tok->cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001058 tok->inp = tok->buf + curvalid;
1059 tok->end = tok->buf + newsize;
1060 tok->start = curstart < 0 ? NULL :
1061 tok->buf + curstart;
1062 if (decoding_fgets(tok->inp,
1063 (int)(tok->end - tok->inp),
1064 tok) == NULL) {
1065 /* Break out early on decoding
1066 errors, as tok->buf will be NULL
1067 */
1068 if (tok->decoding_erred)
1069 return EOF;
1070 /* Last line does not end in \n,
1071 fake one */
1072 strcpy(tok->inp, "\n");
1073 }
1074 tok->inp = strchr(tok->inp, '\0');
1075 done = tok->inp[-1] == '\n';
1076 }
1077 if (tok->buf != NULL) {
1078 tok->cur = tok->buf + cur;
1079 tok->line_start = tok->cur;
1080 /* replace "\r\n" with "\n" */
1081 /* For Mac leave the \r, giving a syntax error */
1082 pt = tok->inp - 2;
1083 if (pt >= tok->buf && *pt == '\r') {
1084 *pt++ = '\n';
1085 *pt = '\0';
1086 tok->inp = pt;
1087 }
1088 }
1089 }
1090 if (tok->done != E_OK) {
1091 if (tok->prompt != NULL)
1092 PySys_WriteStderr("\n");
1093 tok->cur = tok->inp;
1094 return EOF;
1095 }
1096 }
1097 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001098}
1099
1100
1101/* Back-up one character */
1102
1103static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001104tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001105{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 if (c != EOF) {
1107 if (--tok->cur < tok->buf)
1108 Py_FatalError("tok_backup: beginning of buffer");
1109 if (*tok->cur != c)
1110 *tok->cur = c;
1111 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112}
1113
1114
1115/* Return the token corresponding to a single character */
1116
1117int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001118PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 switch (c) {
1121 case '(': return LPAR;
1122 case ')': return RPAR;
1123 case '[': return LSQB;
1124 case ']': return RSQB;
1125 case ':': return COLON;
1126 case ',': return COMMA;
1127 case ';': return SEMI;
1128 case '+': return PLUS;
1129 case '-': return MINUS;
1130 case '*': return STAR;
1131 case '/': return SLASH;
1132 case '|': return VBAR;
1133 case '&': return AMPER;
1134 case '<': return LESS;
1135 case '>': return GREATER;
1136 case '=': return EQUAL;
1137 case '.': return DOT;
1138 case '%': return PERCENT;
1139 case '{': return LBRACE;
1140 case '}': return RBRACE;
1141 case '^': return CIRCUMFLEX;
1142 case '~': return TILDE;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001143 case '@': return AT;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 default: return OP;
1145 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146}
1147
1148
Guido van Rossumfbab9051991-10-20 20:25:03 +00001149int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001150PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001152 switch (c1) {
1153 case '=':
1154 switch (c2) {
1155 case '=': return EQEQUAL;
1156 }
1157 break;
1158 case '!':
1159 switch (c2) {
1160 case '=': return NOTEQUAL;
1161 }
1162 break;
1163 case '<':
1164 switch (c2) {
1165 case '>': return NOTEQUAL;
1166 case '=': return LESSEQUAL;
1167 case '<': return LEFTSHIFT;
1168 }
1169 break;
1170 case '>':
1171 switch (c2) {
1172 case '=': return GREATEREQUAL;
1173 case '>': return RIGHTSHIFT;
1174 }
1175 break;
1176 case '+':
1177 switch (c2) {
1178 case '=': return PLUSEQUAL;
1179 }
1180 break;
1181 case '-':
1182 switch (c2) {
1183 case '=': return MINEQUAL;
1184 case '>': return RARROW;
1185 }
1186 break;
1187 case '*':
1188 switch (c2) {
1189 case '*': return DOUBLESTAR;
1190 case '=': return STAREQUAL;
1191 }
1192 break;
1193 case '/':
1194 switch (c2) {
1195 case '/': return DOUBLESLASH;
1196 case '=': return SLASHEQUAL;
1197 }
1198 break;
1199 case '|':
1200 switch (c2) {
1201 case '=': return VBAREQUAL;
1202 }
1203 break;
1204 case '%':
1205 switch (c2) {
1206 case '=': return PERCENTEQUAL;
1207 }
1208 break;
1209 case '&':
1210 switch (c2) {
1211 case '=': return AMPEREQUAL;
1212 }
1213 break;
1214 case '^':
1215 switch (c2) {
1216 case '=': return CIRCUMFLEXEQUAL;
1217 }
1218 break;
Benjamin Petersond51374e2014-04-09 23:55:56 -04001219 case '@':
1220 switch (c2) {
1221 case '=': return ATEQUAL;
1222 }
1223 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 }
1225 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001226}
1227
Thomas Wouters434d0822000-08-24 20:11:32 +00001228int
1229PyToken_ThreeChars(int c1, int c2, int c3)
1230{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 switch (c1) {
1232 case '<':
1233 switch (c2) {
1234 case '<':
1235 switch (c3) {
1236 case '=':
1237 return LEFTSHIFTEQUAL;
1238 }
1239 break;
1240 }
1241 break;
1242 case '>':
1243 switch (c2) {
1244 case '>':
1245 switch (c3) {
1246 case '=':
1247 return RIGHTSHIFTEQUAL;
1248 }
1249 break;
1250 }
1251 break;
1252 case '*':
1253 switch (c2) {
1254 case '*':
1255 switch (c3) {
1256 case '=':
1257 return DOUBLESTAREQUAL;
1258 }
1259 break;
1260 }
1261 break;
1262 case '/':
1263 switch (c2) {
1264 case '/':
1265 switch (c3) {
1266 case '=':
1267 return DOUBLESLASHEQUAL;
1268 }
1269 break;
1270 }
1271 break;
1272 case '.':
1273 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001274 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 switch (c3) {
1276 case '.':
1277 return ELLIPSIS;
1278 }
1279 break;
1280 }
1281 break;
1282 }
1283 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001284}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001285
Guido van Rossum926f13a1998-04-09 21:38:06 +00001286static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001287indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001288{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 if (tok->alterror) {
1290 tok->done = E_TABSPACE;
1291 tok->cur = tok->inp;
1292 return 1;
1293 }
1294 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001295#ifdef PGEN
1296 PySys_WriteStderr("inconsistent use of tabs and spaces "
1297 "in indentation\n");
1298#else
1299 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001301#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 tok->altwarning = 0;
1303 }
1304 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001305}
1306
Martin v. Löwis47383402007-08-15 07:32:56 +00001307#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001308#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001309#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310/* Verify that the identifier follows PEP 3131.
1311 All identifier strings are guaranteed to be "ready" unicode objects.
1312 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001313static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001314verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001315{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 PyObject *s;
1317 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001318 if (tok->decoding_erred)
1319 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1323 PyErr_Clear();
1324 tok->done = E_IDENTIFIER;
1325 } else {
1326 tok->done = E_ERROR;
1327 }
1328 return 0;
1329 }
1330 result = PyUnicode_IsIdentifier(s);
1331 Py_DECREF(s);
1332 if (result == 0)
1333 tok->done = E_IDENTIFIER;
1334 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001335}
1336#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001337
Brett Cannona721aba2016-09-09 14:57:09 -07001338static int
1339tok_decimal_tail(struct tok_state *tok)
1340{
1341 int c;
1342
1343 while (1) {
1344 do {
1345 c = tok_nextc(tok);
1346 } while (isdigit(c));
1347 if (c != '_') {
1348 break;
1349 }
1350 c = tok_nextc(tok);
1351 if (!isdigit(c)) {
1352 tok->done = E_TOKEN;
1353 tok_backup(tok, c);
1354 return 0;
1355 }
1356 }
1357 return c;
1358}
1359
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360/* Get next token, after space stripping etc. */
1361
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001362static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001363tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001365 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001369 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370 tok->start = NULL;
1371 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001372
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 /* Get indentation level */
1374 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001375 int col = 0;
1376 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 tok->atbol = 0;
1378 for (;;) {
1379 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001380 if (c == ' ') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 col++, altcol++;
Brett Cannona721aba2016-09-09 14:57:09 -07001382 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 else if (c == '\t') {
1384 col = (col/tok->tabsize + 1) * tok->tabsize;
1385 altcol = (altcol/tok->alttabsize + 1)
1386 * tok->alttabsize;
1387 }
Brett Cannona721aba2016-09-09 14:57:09 -07001388 else if (c == '\014') {/* Control-L (formfeed) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 col = altcol = 0; /* For Emacs users */
Brett Cannona721aba2016-09-09 14:57:09 -07001390 }
1391 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001393 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 }
1395 tok_backup(tok, c);
1396 if (c == '#' || c == '\n') {
1397 /* Lines with only whitespace and/or comments
1398 shouldn't affect the indentation and are
1399 not passed to the parser as NEWLINE tokens,
1400 except *totally* empty lines in interactive
1401 mode, which signal the end of a command group. */
Brett Cannona721aba2016-09-09 14:57:09 -07001402 if (col == 0 && c == '\n' && tok->prompt != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 blankline = 0; /* Let it through */
Brett Cannona721aba2016-09-09 14:57:09 -07001404 }
1405 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406 blankline = 1; /* Ignore completely */
Brett Cannona721aba2016-09-09 14:57:09 -07001407 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 /* We can't jump back right here since we still
1409 may need to skip to the end of a comment */
1410 }
1411 if (!blankline && tok->level == 0) {
1412 if (col == tok->indstack[tok->indent]) {
1413 /* No change */
1414 if (altcol != tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001415 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001417 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 }
1419 }
1420 else if (col > tok->indstack[tok->indent]) {
1421 /* Indent -- always one */
1422 if (tok->indent+1 >= MAXINDENT) {
1423 tok->done = E_TOODEEP;
1424 tok->cur = tok->inp;
1425 return ERRORTOKEN;
1426 }
1427 if (altcol <= tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001428 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001429 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001430 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 }
1432 tok->pendin++;
1433 tok->indstack[++tok->indent] = col;
1434 tok->altindstack[tok->indent] = altcol;
1435 }
1436 else /* col < tok->indstack[tok->indent] */ {
1437 /* Dedent -- any number, must be consistent */
1438 while (tok->indent > 0 &&
1439 col < tok->indstack[tok->indent]) {
1440 tok->pendin--;
1441 tok->indent--;
1442 }
1443 if (col != tok->indstack[tok->indent]) {
1444 tok->done = E_DEDENT;
1445 tok->cur = tok->inp;
1446 return ERRORTOKEN;
1447 }
1448 if (altcol != tok->altindstack[tok->indent]) {
Brett Cannona721aba2016-09-09 14:57:09 -07001449 if (indenterror(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001451 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 }
1453 }
1454 }
1455 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001456
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001457 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001458
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001459 /* Return pending indents/dedents */
1460 if (tok->pendin != 0) {
1461 if (tok->pendin < 0) {
1462 tok->pendin++;
1463 return DEDENT;
1464 }
1465 else {
1466 tok->pendin--;
1467 return INDENT;
1468 }
1469 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001470
Yury Selivanov96ec9342015-07-23 15:01:58 +03001471 if (tok->async_def
1472 && !blankline
1473 && tok->level == 0
1474 /* There was a NEWLINE after ASYNC DEF,
1475 so we're past the signature. */
1476 && tok->async_def_nl
1477 /* Current indentation level is less than where
1478 the async function was defined */
1479 && tok->async_def_indent >= tok->indent)
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001480 {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001481 tok->async_def = 0;
1482 tok->async_def_indent = 0;
1483 tok->async_def_nl = 0;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001484 }
1485
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001486 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001487 tok->start = NULL;
1488 /* Skip spaces */
1489 do {
1490 c = tok_nextc(tok);
1491 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Set start of current token */
1494 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001495
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 /* Skip comment */
Brett Cannona721aba2016-09-09 14:57:09 -07001497 if (c == '#') {
1498 while (c != EOF && c != '\n') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001500 }
1501 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001502
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001503 /* Check for EOF and errors now */
1504 if (c == EOF) {
1505 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1506 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001507
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001508 /* Identifier (most frequent token!) */
1509 nonascii = 0;
1510 if (is_potential_identifier_start(c)) {
Berker Peksag6f805622017-02-05 04:32:39 +03001511 /* Process the various legal combinations of b"", r"", u"", and f"". */
Eric V. Smith235a6f02015-09-19 14:51:32 -04001512 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001513 while (1) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001514 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001515 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001516 /* Since this is a backwards compatibility support literal we don't
1517 want to support it in arbitrary order like byte literals. */
Brett Cannona721aba2016-09-09 14:57:09 -07001518 else if (!(saw_b || saw_u || saw_r || saw_f)
1519 && (c == 'u'|| c == 'U')) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001520 saw_u = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001521 }
Christian Heimes0b3847d2012-06-20 11:17:58 +02001522 /* ur"" and ru"" are not supported */
Brett Cannona721aba2016-09-09 14:57:09 -07001523 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001524 saw_r = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001525 }
1526 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
Eric V. Smith235a6f02015-09-19 14:51:32 -04001527 saw_f = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001528 }
1529 else {
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001530 break;
Brett Cannona721aba2016-09-09 14:57:09 -07001531 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001533 if (c == '"' || c == '\'') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 goto letter_quote;
Brett Cannona721aba2016-09-09 14:57:09 -07001535 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001536 }
1537 while (is_potential_identifier_char(c)) {
Brett Cannona721aba2016-09-09 14:57:09 -07001538 if (c >= 128) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 nonascii = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001540 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001541 c = tok_nextc(tok);
1542 }
1543 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001544 if (nonascii && !verify_identifier(tok)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 return ERRORTOKEN;
Brett Cannona721aba2016-09-09 14:57:09 -07001546 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001547 *p_start = tok->start;
1548 *p_end = tok->cur;
Yury Selivanov75445082015-05-11 22:57:16 -04001549
Yury Selivanov96ec9342015-07-23 15:01:58 +03001550 /* async/await parsing block. */
1551 if (tok->cur - tok->start == 5) {
1552 /* Current token length is 5. */
1553 if (tok->async_def) {
1554 /* We're inside an 'async def' function. */
Brett Cannona721aba2016-09-09 14:57:09 -07001555 if (memcmp(tok->start, "async", 5) == 0) {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001556 return ASYNC;
Brett Cannona721aba2016-09-09 14:57:09 -07001557 }
1558 if (memcmp(tok->start, "await", 5) == 0) {
Yury Selivanov96ec9342015-07-23 15:01:58 +03001559 return AWAIT;
Brett Cannona721aba2016-09-09 14:57:09 -07001560 }
Yury Selivanov75445082015-05-11 22:57:16 -04001561 }
Yury Selivanov96ec9342015-07-23 15:01:58 +03001562 else if (memcmp(tok->start, "async", 5) == 0) {
1563 /* The current token is 'async'.
1564 Look ahead one token.*/
Yury Selivanov8085b802015-05-18 12:50:52 -04001565
Yury Selivanov96ec9342015-07-23 15:01:58 +03001566 struct tok_state ahead_tok;
1567 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1568 int ahead_tok_kind;
Yury Selivanov8085b802015-05-18 12:50:52 -04001569
Yury Selivanov75445082015-05-11 22:57:16 -04001570 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
Yury Selivanov75445082015-05-11 22:57:16 -04001571 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
Yury Selivanov96ec9342015-07-23 15:01:58 +03001572 &ahead_tok_end);
Yury Selivanov75445082015-05-11 22:57:16 -04001573
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001574 if (ahead_tok_kind == NAME
1575 && ahead_tok.cur - ahead_tok.start == 3
1576 && memcmp(ahead_tok.start, "def", 3) == 0)
1577 {
1578 /* The next token is going to be 'def', so instead of
1579 returning 'async' NAME token, we return ASYNC. */
Yury Selivanov96ec9342015-07-23 15:01:58 +03001580 tok->async_def_indent = tok->indent;
1581 tok->async_def = 1;
Yury Selivanov75445082015-05-11 22:57:16 -04001582 return ASYNC;
1583 }
Yury Selivanov75445082015-05-11 22:57:16 -04001584 }
1585 }
1586
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 return NAME;
1588 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001589
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 /* Newline */
1591 if (c == '\n') {
1592 tok->atbol = 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001593 if (blankline || tok->level > 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001594 goto nextline;
Brett Cannona721aba2016-09-09 14:57:09 -07001595 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 *p_start = tok->start;
1597 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1598 tok->cont_line = 0;
Yury Selivanov96ec9342015-07-23 15:01:58 +03001599 if (tok->async_def) {
1600 /* We're somewhere inside an 'async def' function, and
1601 we've encountered a NEWLINE after its signature. */
1602 tok->async_def_nl = 1;
Yury Selivanov8fb307c2015-07-22 13:33:45 +03001603 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 return NEWLINE;
1605 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001606
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 /* Period or number starting with period? */
1608 if (c == '.') {
1609 c = tok_nextc(tok);
1610 if (isdigit(c)) {
1611 goto fraction;
1612 } else if (c == '.') {
1613 c = tok_nextc(tok);
1614 if (c == '.') {
1615 *p_start = tok->start;
1616 *p_end = tok->cur;
1617 return ELLIPSIS;
Brett Cannona721aba2016-09-09 14:57:09 -07001618 }
1619 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 tok_backup(tok, c);
1621 }
1622 tok_backup(tok, '.');
Brett Cannona721aba2016-09-09 14:57:09 -07001623 }
1624 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 tok_backup(tok, c);
1626 }
1627 *p_start = tok->start;
1628 *p_end = tok->cur;
1629 return DOT;
1630 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001631
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001632 /* Number */
1633 if (isdigit(c)) {
1634 if (c == '0') {
1635 /* Hex, octal or binary -- maybe. */
1636 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 if (c == 'x' || c == 'X') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 /* Hex */
1639 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001640 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001641 if (c == '_') {
1642 c = tok_nextc(tok);
1643 }
1644 if (!isxdigit(c)) {
1645 tok->done = E_TOKEN;
1646 tok_backup(tok, c);
1647 return ERRORTOKEN;
1648 }
1649 do {
1650 c = tok_nextc(tok);
1651 } while (isxdigit(c));
1652 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 }
1654 else if (c == 'o' || c == 'O') {
1655 /* Octal */
1656 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001657 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001658 if (c == '_') {
1659 c = tok_nextc(tok);
1660 }
1661 if (c < '0' || c >= '8') {
1662 tok->done = E_TOKEN;
1663 tok_backup(tok, c);
1664 return ERRORTOKEN;
1665 }
1666 do {
1667 c = tok_nextc(tok);
1668 } while ('0' <= c && c < '8');
1669 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 }
1671 else if (c == 'b' || c == 'B') {
1672 /* Binary */
1673 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 do {
Brett Cannona721aba2016-09-09 14:57:09 -07001675 if (c == '_') {
1676 c = tok_nextc(tok);
1677 }
1678 if (c != '0' && c != '1') {
1679 tok->done = E_TOKEN;
1680 tok_backup(tok, c);
1681 return ERRORTOKEN;
1682 }
1683 do {
1684 c = tok_nextc(tok);
1685 } while (c == '0' || c == '1');
1686 } while (c == '_');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 }
1688 else {
1689 int nonzero = 0;
1690 /* maybe old-style octal; c is first char of it */
1691 /* in any case, allow '0' as a literal */
Brett Cannona721aba2016-09-09 14:57:09 -07001692 while (1) {
1693 if (c == '_') {
1694 c = tok_nextc(tok);
1695 if (!isdigit(c)) {
1696 tok->done = E_TOKEN;
1697 tok_backup(tok, c);
1698 return ERRORTOKEN;
1699 }
1700 }
1701 if (c != '0') {
1702 break;
1703 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 c = tok_nextc(tok);
1705 }
Brett Cannona721aba2016-09-09 14:57:09 -07001706 if (isdigit(c)) {
1707 nonzero = 1;
1708 c = tok_decimal_tail(tok);
1709 if (c == 0) {
1710 return ERRORTOKEN;
1711 }
1712 }
1713 if (c == '.') {
1714 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 goto fraction;
Brett Cannona721aba2016-09-09 14:57:09 -07001716 }
1717 else if (c == 'e' || c == 'E') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 goto exponent;
Brett Cannona721aba2016-09-09 14:57:09 -07001719 }
1720 else if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 goto imaginary;
Brett Cannona721aba2016-09-09 14:57:09 -07001722 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001723 else if (nonzero) {
Brett Cannona721aba2016-09-09 14:57:09 -07001724 /* Old-style octal: now disallowed. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001725 tok->done = E_TOKEN;
1726 tok_backup(tok, c);
1727 return ERRORTOKEN;
1728 }
1729 }
1730 }
1731 else {
1732 /* Decimal */
Brett Cannona721aba2016-09-09 14:57:09 -07001733 c = tok_decimal_tail(tok);
1734 if (c == 0) {
1735 return ERRORTOKEN;
1736 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 {
1738 /* Accept floating point numbers. */
1739 if (c == '.') {
Brett Cannona721aba2016-09-09 14:57:09 -07001740 c = tok_nextc(tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001741 fraction:
1742 /* Fraction */
Brett Cannona721aba2016-09-09 14:57:09 -07001743 if (isdigit(c)) {
1744 c = tok_decimal_tail(tok);
1745 if (c == 0) {
1746 return ERRORTOKEN;
1747 }
1748 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 }
1750 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001751 int e;
1752 exponent:
1753 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 /* Exponent part */
1755 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001756 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001757 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001758 if (!isdigit(c)) {
1759 tok->done = E_TOKEN;
1760 tok_backup(tok, c);
1761 return ERRORTOKEN;
1762 }
1763 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001764 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001765 tok_backup(tok, e);
1766 *p_start = tok->start;
1767 *p_end = tok->cur;
1768 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 }
Brett Cannona721aba2016-09-09 14:57:09 -07001770 c = tok_decimal_tail(tok);
1771 if (c == 0) {
1772 return ERRORTOKEN;
1773 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 }
Brett Cannona721aba2016-09-09 14:57:09 -07001775 if (c == 'j' || c == 'J') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001776 /* Imaginary part */
1777 imaginary:
1778 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001779 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001780 }
1781 }
1782 tok_backup(tok, c);
1783 *p_start = tok->start;
1784 *p_end = tok->cur;
1785 return NUMBER;
1786 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001787
1788 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001789 /* String */
1790 if (c == '\'' || c == '"') {
1791 int quote = c;
1792 int quote_size = 1; /* 1 or 3 */
1793 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001794
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001795 /* Find the quote size and start of string */
1796 c = tok_nextc(tok);
1797 if (c == quote) {
1798 c = tok_nextc(tok);
Brett Cannona721aba2016-09-09 14:57:09 -07001799 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001800 quote_size = 3;
Brett Cannona721aba2016-09-09 14:57:09 -07001801 }
1802 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001803 end_quote_size = 1; /* empty string found */
Brett Cannona721aba2016-09-09 14:57:09 -07001804 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001805 }
Brett Cannona721aba2016-09-09 14:57:09 -07001806 if (c != quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001807 tok_backup(tok, c);
Brett Cannona721aba2016-09-09 14:57:09 -07001808 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001809
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001810 /* Get rest of string */
1811 while (end_quote_size != quote_size) {
1812 c = tok_nextc(tok);
1813 if (c == EOF) {
Brett Cannona721aba2016-09-09 14:57:09 -07001814 if (quote_size == 3) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001815 tok->done = E_EOFS;
Brett Cannona721aba2016-09-09 14:57:09 -07001816 }
1817 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001818 tok->done = E_EOLS;
Brett Cannona721aba2016-09-09 14:57:09 -07001819 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001820 tok->cur = tok->inp;
1821 return ERRORTOKEN;
1822 }
1823 if (quote_size == 1 && c == '\n') {
1824 tok->done = E_EOLS;
1825 tok->cur = tok->inp;
1826 return ERRORTOKEN;
1827 }
Brett Cannona721aba2016-09-09 14:57:09 -07001828 if (c == quote) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001829 end_quote_size += 1;
Brett Cannona721aba2016-09-09 14:57:09 -07001830 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001831 else {
1832 end_quote_size = 0;
Brett Cannona721aba2016-09-09 14:57:09 -07001833 if (c == '\\') {
Christian Heimesc6cc23d2016-09-09 00:09:45 +02001834 tok_nextc(tok); /* skip escaped char */
Brett Cannona721aba2016-09-09 14:57:09 -07001835 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001836 }
1837 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001838
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 *p_start = tok->start;
1840 *p_end = tok->cur;
1841 return STRING;
1842 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001843
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001844 /* Line continuation */
1845 if (c == '\\') {
1846 c = tok_nextc(tok);
1847 if (c != '\n') {
1848 tok->done = E_LINECONT;
1849 tok->cur = tok->inp;
1850 return ERRORTOKEN;
1851 }
1852 tok->cont_line = 1;
1853 goto again; /* Read next line */
1854 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001855
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001856 /* Check for two-character token */
1857 {
1858 int c2 = tok_nextc(tok);
1859 int token = PyToken_TwoChars(c, c2);
1860 if (token != OP) {
1861 int c3 = tok_nextc(tok);
1862 int token3 = PyToken_ThreeChars(c, c2, c3);
1863 if (token3 != OP) {
1864 token = token3;
Brett Cannona721aba2016-09-09 14:57:09 -07001865 }
1866 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001867 tok_backup(tok, c3);
1868 }
1869 *p_start = tok->start;
1870 *p_end = tok->cur;
1871 return token;
1872 }
1873 tok_backup(tok, c2);
1874 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001875
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001876 /* Keep track of parentheses nesting level */
1877 switch (c) {
1878 case '(':
1879 case '[':
1880 case '{':
1881 tok->level++;
1882 break;
1883 case ')':
1884 case ']':
1885 case '}':
1886 tok->level--;
1887 break;
1888 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001889
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001890 /* Punctuation character */
1891 *p_start = tok->start;
1892 *p_end = tok->cur;
1893 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001894}
1895
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001896int
1897PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1898{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001899 int result = tok_get(tok, p_start, p_end);
1900 if (tok->decoding_erred) {
1901 result = ERRORTOKEN;
1902 tok->done = E_DECODE;
1903 }
1904 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001905}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001906
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001907/* Get the encoding of a Python file. Check for the coding cookie and check if
1908 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001909
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001910 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1911 encoding in the first or second line of the file (in which case the encoding
1912 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001913
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001914 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1915 by the caller. */
1916
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001917char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001918PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001919{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001920 struct tok_state *tok;
1921 FILE *fp;
1922 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001923
Victor Stinnerdaf45552013-08-28 00:53:59 +02001924#ifndef PGEN
1925 fd = _Py_dup(fd);
1926#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001927 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001928#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001929 if (fd < 0) {
1930 return NULL;
1931 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001932
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001933 fp = fdopen(fd, "r");
1934 if (fp == NULL) {
1935 return NULL;
1936 }
1937 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1938 if (tok == NULL) {
1939 fclose(fp);
1940 return NULL;
1941 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001942#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001943 if (filename != NULL) {
1944 Py_INCREF(filename);
1945 tok->filename = filename;
1946 }
1947 else {
1948 tok->filename = PyUnicode_FromString("<string>");
1949 if (tok->filename == NULL) {
1950 fclose(fp);
1951 PyTokenizer_Free(tok);
1952 return encoding;
1953 }
1954 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001955#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001956 while (tok->lineno < 2 && tok->done == E_OK) {
1957 PyTokenizer_Get(tok, &p_start, &p_end);
1958 }
1959 fclose(fp);
1960 if (tok->encoding) {
1961 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1962 if (encoding)
1963 strcpy(encoding, tok->encoding);
1964 }
1965 PyTokenizer_Free(tok);
1966 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001967}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001968
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001969char *
1970PyTokenizer_FindEncoding(int fd)
1971{
1972 return PyTokenizer_FindEncodingFilename(fd, NULL);
1973}
1974
Guido van Rossum408027e1996-12-30 16:17:54 +00001975#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001976
1977void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001978tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001979{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001980 printf("%s", _PyParser_TokenNames[type]);
1981 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1982 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001983}
1984
1985#endif