blob: 22accd1061aeaffaeca1bfdf013404fa169b0853 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero when C may begin an identifier: an ASCII letter, an
   underscore, or any value of 128 and above.
   The argument is fully parenthesized so that expression arguments
   (for example a conditional expression) parse as intended.
   NOTE: C is evaluated more than once; do not pass an expression
   with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero when C may appear inside an identifier: an ASCII letter or
   digit, an underscore, or any value of 128 and above.
   The argument is fully parenthesized so that expression arguments
   parse as intended.
   NOTE: C is evaluated more than once; do not pass an expression
   with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token, indexed by token number.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER", "EQUAL",
    /* 23 */ "DOT", "PERCENT", "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP",
    /* 53 */ "<ERRORTOKEN>",
    /* 54 */ "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: no decoding is done, the result is simply an allocated
   copy of STR (NULL on allocation failure).  EXEC_INPUT is ignored. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the common spellings of UTF-8 and Latin-1 to one canonical name.

   Only the first 12 characters of S are examined.  They are compared
   case-insensitively with '_' treated as '-'.  Returns the literal
   "utf-8" or "iso-8859-1" when S names one of those encodings (with
   or without a "-suffix"), otherwise returns S itself unchanged.
   Callers can detect the "unchanged" case by comparing pointers; the
   literal results must not be modified or freed.

   Case folding is done on the ASCII range only.  tolower() is avoided
   because it is locale-dependent and is undefined for negative char
   values. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = (char)c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200286 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200288 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200290 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 if (!get_coding_spec(line, &cs, size, tok))
292 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200293 if (!cs) {
294 Py_ssize_t i;
295 for (i = 0; i < size; i++) {
296 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
297 break;
298 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
299 /* Stop checking coding spec after a line containing
300 * anything except a comment. */
301 tok->read_coding_spec = 1;
302 break;
303 }
304 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200306 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700307 tok->read_coding_spec = 1;
308 if (tok->encoding == NULL) {
309 assert(tok->decoding_state == STATE_RAW);
310 if (strcmp(cs, "utf-8") == 0) {
311 tok->encoding = cs;
312 } else {
313 r = set_readline(tok, cs);
314 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300319 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700320 "encoding problem: %s", cs);
321 PyMem_FREE(cs);
322 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 } else { /* then, compare cs with BOM */
325 r = (strcmp(tok->encoding, cs) == 0);
326 if (!r)
327 PyErr_Format(PyExc_SyntaxError,
328 "encoding problem: %s with BOM", cs);
329 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332}
333
334/* See whether the file starts with a BOM. If it does,
335 invoke the set_readline function with the new encoding.
336 Return 1 on success, 0 on failure. */
337
338static int
339check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 void unget_char(int, struct tok_state *),
341 int set_readline(struct tok_state *, const char *),
342 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 int ch1, ch2, ch3;
345 ch1 = get_char(tok);
346 tok->decoding_state = STATE_RAW;
347 if (ch1 == EOF) {
348 return 1;
349 } else if (ch1 == 0xEF) {
350 ch2 = get_char(tok);
351 if (ch2 != 0xBB) {
352 unget_char(ch2, tok);
353 unget_char(ch1, tok);
354 return 1;
355 }
356 ch3 = get_char(tok);
357 if (ch3 != 0xBF) {
358 unget_char(ch3, tok);
359 unget_char(ch2, tok);
360 unget_char(ch1, tok);
361 return 1;
362 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Disable support for UTF-16 BOMs until a decision
365 is made whether this needs to be supported. */
366 } else if (ch1 == 0xFE) {
367 ch2 = get_char(tok);
368 if (ch2 != 0xFF) {
369 unget_char(ch2, tok);
370 unget_char(ch1, tok);
371 return 1;
372 }
373 if (!set_readline(tok, "utf-16-be"))
374 return 0;
375 tok->decoding_state = STATE_NORMAL;
376 } else if (ch1 == 0xFF) {
377 ch2 = get_char(tok);
378 if (ch2 != 0xFE) {
379 unget_char(ch2, tok);
380 unget_char(ch1, tok);
381 return 1;
382 }
383 if (!set_readline(tok, "utf-16-le"))
384 return 0;
385 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000386#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 } else {
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (tok->encoding != NULL)
392 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700393 tok->encoding = new_string("utf-8", 5, tok);
394 if (!tok->encoding)
395 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000396 /* No need to set_readline: input is already utf-8 */
397 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000398}
399
400/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000401 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000403 On entry, tok->decoding_buffer will be one of:
404 1) NULL: need to call tok->decoding_readline to get a new line
405 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000407 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 (in the s buffer) to copy entire contents of the line read
409 by tok->decoding_readline. tok->decoding_buffer has the overflow.
410 In this case, fp_readl is called in a loop (with an expanded buffer)
411 until the buffer ends with a '\n' (or until the end of the file is
412 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000413*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414
415static char *
416fp_readl(char *s, int size, struct tok_state *tok)
417{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 PyObject* bufobj;
419 const char *buf;
420 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 /* Ask for one less byte so we can terminate it */
423 assert(size > 0);
424 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 if (tok->decoding_buffer) {
427 bufobj = tok->decoding_buffer;
428 Py_INCREF(bufobj);
429 }
430 else
431 {
432 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
433 if (bufobj == NULL)
434 goto error;
435 }
436 if (PyUnicode_CheckExact(bufobj))
437 {
438 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
439 if (buf == NULL) {
440 goto error;
441 }
442 }
443 else
444 {
445 buf = PyByteArray_AsString(bufobj);
446 if (buf == NULL) {
447 goto error;
448 }
449 buflen = PyByteArray_GET_SIZE(bufobj);
450 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 Py_XDECREF(tok->decoding_buffer);
453 if (buflen > size) {
454 /* Too many chars, the rest goes into tok->decoding_buffer */
455 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
456 buflen-size);
457 if (tok->decoding_buffer == NULL)
458 goto error;
459 buflen = size;
460 }
461 else
462 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 memcpy(s, buf, buflen);
465 s[buflen] = '\0';
466 if (buflen == 0) /* EOF */
467 s = NULL;
468 Py_DECREF(bufobj);
469 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000470
471error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 Py_XDECREF(bufobj);
473 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Set the readline function for TOK to a StreamReader's
477 readline function. The StreamReader is named ENC.
478
479 This function is called from check_bom and check_coding_spec.
480
481 ENC is usually identical to the future value of tok->encoding,
482 except for the (currently unsupported) case of UTF-16.
483
484 Return 1 on success, 0 on failure. */
485
486static int
487fp_setreadl(struct tok_state *tok, const char* enc)
488{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(open);
491 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000492 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200493 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 io = PyImport_ImportModuleNoBlock("io");
496 if (io == NULL)
497 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Victor Stinner22a351a2010-10-14 12:04:34 +0000499 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200500 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100501 * position of tok->fp. If tok->fp was opened in text mode on Windows,
502 * its file position counts CRLF as one char and can't be directly mapped
503 * to the file offset for fd. Instead we step back one byte and read to
504 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200505 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100506 if (pos == -1 ||
507 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000508 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
509 goto cleanup;
510 }
511
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200512 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000513 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514 if (stream == NULL)
515 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000517 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200518 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000519 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100520 if (pos > 0) {
521 if (PyObject_CallObject(readline, NULL) == NULL) {
522 readline = NULL;
523 goto cleanup;
524 }
525 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000526
527 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000528 Py_XDECREF(stream);
529 Py_XDECREF(io);
530 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531}
532
533/* Fetch the next byte from TOK. */
534
535static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537}
538
539/* Unfetch the last byte back into TOK. */
540
541static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   This is a structural check only; overlong forms and lead bytes
   0xF5..0xF7 are not rejected here.

   Continuation bytes are examined front to back.  A NUL terminator is
   not a continuation byte, so the scan stops at the end of a
   NUL-terminated string instead of reading beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
572
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573/* Read a line of input from TOK. Determine encoding
574 if necessary. */
575
576static char *
577decoding_fgets(char *s, int size, struct tok_state *tok)
578{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 char *line = NULL;
580 int badchar = 0;
581 for (;;) {
582 if (tok->decoding_state == STATE_NORMAL) {
583 /* We already have a codec associated with
584 this input. */
585 line = fp_readl(s, size, tok);
586 break;
587 } else if (tok->decoding_state == STATE_RAW) {
588 /* We want a 'raw' read. */
589 line = Py_UniversalNewlineFgets(s, size,
590 tok->fp, NULL);
591 break;
592 } else {
593 /* We have not yet determined the encoding.
594 If an encoding is found, use the file-pointer
595 reader functions from now on. */
596 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
597 return error_ret(tok);
598 assert(tok->decoding_state != STATE_INIT);
599 }
600 }
601 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
602 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
603 return error_ret(tok);
604 }
605 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 /* The default encoding is UTF-8, so make sure we don't have any
608 non-UTF-8 sequences in it. */
609 if (line && !tok->encoding) {
610 unsigned char *c;
611 int length;
612 for (c = (unsigned char *)line; *c; c += length)
613 if (!(length = valid_utf8(c))) {
614 badchar = *c;
615 break;
616 }
617 }
618 if (badchar) {
619 /* Need to add 1 to the line number, since this line
620 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200621 PyErr_Format(PyExc_SyntaxError,
622 "Non-UTF-8 code starting with '\\x%.2x' "
623 "in file %U on line %i, "
624 "but no encoding declared; "
625 "see http://python.org/dev/peps/pep-0263/ for details",
626 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 return error_ret(tok);
628 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000629#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633static int
634decoding_feof(struct tok_state *tok)
635{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 if (tok->decoding_state != STATE_NORMAL) {
637 return feof(tok->fp);
638 } else {
639 PyObject* buf = tok->decoding_buffer;
640 if (buf == NULL) {
641 buf = PyObject_CallObject(tok->decoding_readline, NULL);
642 if (buf == NULL) {
643 error_ret(tok);
644 return 1;
645 } else {
646 tok->decoding_buffer = buf;
647 }
648 }
649 return PyObject_Length(buf) == 0;
650 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651}
652
653/* Fetch a byte from TOK, using the string buffer. */
654
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000655static int
656buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658}
659
660/* Unfetch a byte from TOK, using the string buffer. */
661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000662static void
663buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 tok->str--;
665 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666}
667
668/* Set the readline function for TOK to ENC. For the string-based
669 tokenizer, this means to just record the encoding. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static int
672buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->enc = enc;
674 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Return a UTF-8 encoding Python string object from the
678 C byte string STR, which is encoded with ENC. */
679
680static PyObject *
681translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 PyObject *utf8;
683 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
684 if (buf == NULL)
685 return NULL;
686 utf8 = PyUnicode_AsUTF8String(buf);
687 Py_DECREF(buf);
688 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000689}
690
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000691
692static char *
693translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200694 int skip_next_lf = 0;
695 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 char *buf, *current;
697 char c = '\0';
698 buf = PyMem_MALLOC(needed_length);
699 if (buf == NULL) {
700 tok->done = E_NOMEM;
701 return NULL;
702 }
703 for (current = buf; *s; s++, current++) {
704 c = *s;
705 if (skip_next_lf) {
706 skip_next_lf = 0;
707 if (c == '\n') {
708 c = *++s;
709 if (!c)
710 break;
711 }
712 }
713 if (c == '\r') {
714 skip_next_lf = 1;
715 c = '\n';
716 }
717 *current = c;
718 }
719 /* If this is exec input, add a newline to the end of the string if
720 there isn't one already. */
721 if (exec_input && c != '\n') {
722 *current = '\n';
723 current++;
724 }
725 *current = '\0';
726 final_length = current - buf + 1;
727 if (final_length < needed_length && final_length)
728 /* should never fail */
729 buf = PyMem_REALLOC(buf, final_length);
730 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000731}
732
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000733/* Decode a byte string STR for use as the buffer of TOK.
734 Look for encoding declarations inside STR, and record them
735 inside TOK. */
736
737static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000738decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000739{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 PyObject* utf8 = NULL;
741 const char *str;
742 const char *s;
743 const char *newl[2] = {NULL, NULL};
744 int lineno = 0;
745 tok->input = str = translate_newlines(input, single, tok);
746 if (str == NULL)
747 return NULL;
748 tok->enc = NULL;
749 tok->str = str;
750 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
751 return error_ret(tok);
752 str = tok->str; /* string after BOM if any */
753 assert(str);
754 if (tok->enc != NULL) {
755 utf8 = translate_into_utf8(str, tok->enc);
756 if (utf8 == NULL)
757 return error_ret(tok);
758 str = PyBytes_AsString(utf8);
759 }
760 for (s = str;; s++) {
761 if (*s == '\0') break;
762 else if (*s == '\n') {
763 assert(lineno < 2);
764 newl[lineno] = s;
765 lineno++;
766 if (lineno == 2) break;
767 }
768 }
769 tok->enc = NULL;
770 /* need to check line 1 and 2 separately since check_coding_spec
771 assumes a single line as input */
772 if (newl[0]) {
773 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
774 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200775 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
777 tok, buf_setreadl))
778 return error_ret(tok);
779 }
780 }
781 if (tok->enc != NULL) {
782 assert(utf8 == NULL);
783 utf8 = translate_into_utf8(str, tok->enc);
784 if (utf8 == NULL)
785 return error_ret(tok);
786 str = PyBytes_AS_STRING(utf8);
787 }
788 assert(tok->decoding_buffer == NULL);
789 tok->decoding_buffer = utf8; /* CAUTION */
790 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000791}
792
793#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000794
795/* Set up tokenizer for string */
796
797struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000798PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 struct tok_state *tok = tok_new();
801 if (tok == NULL)
802 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300803 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 if (str == NULL) {
805 PyTokenizer_Free(tok);
806 return NULL;
807 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000808
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 /* XXX: constify members. */
810 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
811 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812}
813
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000814struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000815PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000816{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 struct tok_state *tok = tok_new();
818 if (tok == NULL)
819 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000820#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000822#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 if (str == NULL) {
824 PyTokenizer_Free(tok);
825 return NULL;
826 }
827 tok->decoding_state = STATE_RAW;
828 tok->read_coding_spec = 1;
829 tok->enc = NULL;
830 tok->str = str;
831 tok->encoding = (char *)PyMem_MALLOC(6);
832 if (!tok->encoding) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000837
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 /* XXX: constify members. */
839 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
840 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000841}
842
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000843/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000844
845struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300846PyTokenizer_FromFile(FILE *fp, const char* enc,
847 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000848{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 struct tok_state *tok = tok_new();
850 if (tok == NULL)
851 return NULL;
852 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
853 PyTokenizer_Free(tok);
854 return NULL;
855 }
856 tok->cur = tok->inp = tok->buf;
857 tok->end = tok->buf + BUFSIZ;
858 tok->fp = fp;
859 tok->prompt = ps1;
860 tok->nextprompt = ps2;
861 if (enc != NULL) {
862 /* Must copy encoding declaration since it
863 gets copied into the parse tree. */
864 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
865 if (!tok->encoding) {
866 PyTokenizer_Free(tok);
867 return NULL;
868 }
869 strcpy(tok->encoding, enc);
870 tok->decoding_state = STATE_NORMAL;
871 }
872 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873}
874
875
876/* Free a tok_state structure */
877
878void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000879PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000880{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000881 if (tok->encoding != NULL)
882 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000883#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000884 Py_XDECREF(tok->decoding_readline);
885 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200886 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000887#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 if (tok->fp != NULL && tok->buf != NULL)
889 PyMem_FREE(tok->buf);
890 if (tok->input)
891 PyMem_FREE((char *)tok->input);
892 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000893}
894
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000895/* Get next char, updating state; error code goes into tok->done */
896
897static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200898tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000899{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 for (;;) {
901 if (tok->cur != tok->inp) {
902 return Py_CHARMASK(*tok->cur++); /* Fast path */
903 }
904 if (tok->done != E_OK)
905 return EOF;
906 if (tok->fp == NULL) {
907 char *end = strchr(tok->inp, '\n');
908 if (end != NULL)
909 end++;
910 else {
911 end = strchr(tok->inp, '\0');
912 if (end == tok->inp) {
913 tok->done = E_EOF;
914 return EOF;
915 }
916 }
917 if (tok->start == NULL)
918 tok->buf = tok->cur;
919 tok->line_start = tok->cur;
920 tok->lineno++;
921 tok->inp = end;
922 return Py_CHARMASK(*tok->cur++);
923 }
924 if (tok->prompt != NULL) {
925 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000926#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000927 if (newtok != NULL) {
928 char *translated = translate_newlines(newtok, 0, tok);
929 PyMem_FREE(newtok);
930 if (translated == NULL)
931 return EOF;
932 newtok = translated;
933 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000934 if (tok->encoding && newtok && *newtok) {
935 /* Recode to UTF-8 */
936 Py_ssize_t buflen;
937 const char* buf;
938 PyObject *u = translate_into_utf8(newtok, tok->encoding);
939 PyMem_FREE(newtok);
940 if (!u) {
941 tok->done = E_DECODE;
942 return EOF;
943 }
944 buflen = PyBytes_GET_SIZE(u);
945 buf = PyBytes_AS_STRING(u);
946 if (!buf) {
947 Py_DECREF(u);
948 tok->done = E_DECODE;
949 return EOF;
950 }
951 newtok = PyMem_MALLOC(buflen+1);
952 strcpy(newtok, buf);
953 Py_DECREF(u);
954 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000955#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000956 if (tok->nextprompt != NULL)
957 tok->prompt = tok->nextprompt;
958 if (newtok == NULL)
959 tok->done = E_INTR;
960 else if (*newtok == '\0') {
961 PyMem_FREE(newtok);
962 tok->done = E_EOF;
963 }
964 else if (tok->start != NULL) {
965 size_t start = tok->start - tok->buf;
966 size_t oldlen = tok->cur - tok->buf;
967 size_t newlen = oldlen + strlen(newtok);
968 char *buf = tok->buf;
969 buf = (char *)PyMem_REALLOC(buf, newlen+1);
970 tok->lineno++;
971 if (buf == NULL) {
972 PyMem_FREE(tok->buf);
973 tok->buf = NULL;
974 PyMem_FREE(newtok);
975 tok->done = E_NOMEM;
976 return EOF;
977 }
978 tok->buf = buf;
979 tok->cur = tok->buf + oldlen;
980 tok->line_start = tok->cur;
981 strcpy(tok->buf + oldlen, newtok);
982 PyMem_FREE(newtok);
983 tok->inp = tok->buf + newlen;
984 tok->end = tok->inp + 1;
985 tok->start = tok->buf + start;
986 }
987 else {
988 tok->lineno++;
989 if (tok->buf != NULL)
990 PyMem_FREE(tok->buf);
991 tok->buf = newtok;
992 tok->line_start = tok->buf;
993 tok->cur = tok->buf;
994 tok->line_start = tok->buf;
995 tok->inp = strchr(tok->buf, '\0');
996 tok->end = tok->inp + 1;
997 }
998 }
999 else {
1000 int done = 0;
1001 Py_ssize_t cur = 0;
1002 char *pt;
1003 if (tok->start == NULL) {
1004 if (tok->buf == NULL) {
1005 tok->buf = (char *)
1006 PyMem_MALLOC(BUFSIZ);
1007 if (tok->buf == NULL) {
1008 tok->done = E_NOMEM;
1009 return EOF;
1010 }
1011 tok->end = tok->buf + BUFSIZ;
1012 }
1013 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1014 tok) == NULL) {
1015 tok->done = E_EOF;
1016 done = 1;
1017 }
1018 else {
1019 tok->done = E_OK;
1020 tok->inp = strchr(tok->buf, '\0');
1021 done = tok->inp[-1] == '\n';
1022 }
1023 }
1024 else {
1025 cur = tok->cur - tok->buf;
1026 if (decoding_feof(tok)) {
1027 tok->done = E_EOF;
1028 done = 1;
1029 }
1030 else
1031 tok->done = E_OK;
1032 }
1033 tok->lineno++;
1034 /* Read until '\n' or EOF */
1035 while (!done) {
1036 Py_ssize_t curstart = tok->start == NULL ? -1 :
1037 tok->start - tok->buf;
1038 Py_ssize_t curvalid = tok->inp - tok->buf;
1039 Py_ssize_t newsize = curvalid + BUFSIZ;
1040 char *newbuf = tok->buf;
1041 newbuf = (char *)PyMem_REALLOC(newbuf,
1042 newsize);
1043 if (newbuf == NULL) {
1044 tok->done = E_NOMEM;
1045 tok->cur = tok->inp;
1046 return EOF;
1047 }
1048 tok->buf = newbuf;
1049 tok->inp = tok->buf + curvalid;
1050 tok->end = tok->buf + newsize;
1051 tok->start = curstart < 0 ? NULL :
1052 tok->buf + curstart;
1053 if (decoding_fgets(tok->inp,
1054 (int)(tok->end - tok->inp),
1055 tok) == NULL) {
1056 /* Break out early on decoding
1057 errors, as tok->buf will be NULL
1058 */
1059 if (tok->decoding_erred)
1060 return EOF;
1061 /* Last line does not end in \n,
1062 fake one */
1063 strcpy(tok->inp, "\n");
1064 }
1065 tok->inp = strchr(tok->inp, '\0');
1066 done = tok->inp[-1] == '\n';
1067 }
1068 if (tok->buf != NULL) {
1069 tok->cur = tok->buf + cur;
1070 tok->line_start = tok->cur;
1071 /* replace "\r\n" with "\n" */
1072 /* For Mac leave the \r, giving a syntax error */
1073 pt = tok->inp - 2;
1074 if (pt >= tok->buf && *pt == '\r') {
1075 *pt++ = '\n';
1076 *pt = '\0';
1077 tok->inp = pt;
1078 }
1079 }
1080 }
1081 if (tok->done != E_OK) {
1082 if (tok->prompt != NULL)
1083 PySys_WriteStderr("\n");
1084 tok->cur = tok->inp;
1085 return EOF;
1086 }
1087 }
1088 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089}
1090
1091
1092/* Back-up one character */
1093
1094static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001095tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001096{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 if (c != EOF) {
1098 if (--tok->cur < tok->buf)
1099 Py_FatalError("tok_backup: beginning of buffer");
1100 if (*tok->cur != c)
1101 *tok->cur = c;
1102 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103}
1104
1105
1106/* Return the token corresponding to a single character */
1107
1108int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001109PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 switch (c) {
1112 case '(': return LPAR;
1113 case ')': return RPAR;
1114 case '[': return LSQB;
1115 case ']': return RSQB;
1116 case ':': return COLON;
1117 case ',': return COMMA;
1118 case ';': return SEMI;
1119 case '+': return PLUS;
1120 case '-': return MINUS;
1121 case '*': return STAR;
1122 case '/': return SLASH;
1123 case '|': return VBAR;
1124 case '&': return AMPER;
1125 case '<': return LESS;
1126 case '>': return GREATER;
1127 case '=': return EQUAL;
1128 case '.': return DOT;
1129 case '%': return PERCENT;
1130 case '{': return LBRACE;
1131 case '}': return RBRACE;
1132 case '^': return CIRCUMFLEX;
1133 case '~': return TILDE;
1134 case '@': return AT;
1135 default: return OP;
1136 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137}
1138
1139
Guido van Rossumfbab9051991-10-20 20:25:03 +00001140int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001141PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001142{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 switch (c1) {
1144 case '=':
1145 switch (c2) {
1146 case '=': return EQEQUAL;
1147 }
1148 break;
1149 case '!':
1150 switch (c2) {
1151 case '=': return NOTEQUAL;
1152 }
1153 break;
1154 case '<':
1155 switch (c2) {
1156 case '>': return NOTEQUAL;
1157 case '=': return LESSEQUAL;
1158 case '<': return LEFTSHIFT;
1159 }
1160 break;
1161 case '>':
1162 switch (c2) {
1163 case '=': return GREATEREQUAL;
1164 case '>': return RIGHTSHIFT;
1165 }
1166 break;
1167 case '+':
1168 switch (c2) {
1169 case '=': return PLUSEQUAL;
1170 }
1171 break;
1172 case '-':
1173 switch (c2) {
1174 case '=': return MINEQUAL;
1175 case '>': return RARROW;
1176 }
1177 break;
1178 case '*':
1179 switch (c2) {
1180 case '*': return DOUBLESTAR;
1181 case '=': return STAREQUAL;
1182 }
1183 break;
1184 case '/':
1185 switch (c2) {
1186 case '/': return DOUBLESLASH;
1187 case '=': return SLASHEQUAL;
1188 }
1189 break;
1190 case '|':
1191 switch (c2) {
1192 case '=': return VBAREQUAL;
1193 }
1194 break;
1195 case '%':
1196 switch (c2) {
1197 case '=': return PERCENTEQUAL;
1198 }
1199 break;
1200 case '&':
1201 switch (c2) {
1202 case '=': return AMPEREQUAL;
1203 }
1204 break;
1205 case '^':
1206 switch (c2) {
1207 case '=': return CIRCUMFLEXEQUAL;
1208 }
1209 break;
1210 }
1211 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001212}
1213
Thomas Wouters434d0822000-08-24 20:11:32 +00001214int
1215PyToken_ThreeChars(int c1, int c2, int c3)
1216{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 switch (c1) {
1218 case '<':
1219 switch (c2) {
1220 case '<':
1221 switch (c3) {
1222 case '=':
1223 return LEFTSHIFTEQUAL;
1224 }
1225 break;
1226 }
1227 break;
1228 case '>':
1229 switch (c2) {
1230 case '>':
1231 switch (c3) {
1232 case '=':
1233 return RIGHTSHIFTEQUAL;
1234 }
1235 break;
1236 }
1237 break;
1238 case '*':
1239 switch (c2) {
1240 case '*':
1241 switch (c3) {
1242 case '=':
1243 return DOUBLESTAREQUAL;
1244 }
1245 break;
1246 }
1247 break;
1248 case '/':
1249 switch (c2) {
1250 case '/':
1251 switch (c3) {
1252 case '=':
1253 return DOUBLESLASHEQUAL;
1254 }
1255 break;
1256 }
1257 break;
1258 case '.':
1259 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001260 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001261 switch (c3) {
1262 case '.':
1263 return ELLIPSIS;
1264 }
1265 break;
1266 }
1267 break;
1268 }
1269 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001270}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001271
Guido van Rossum926f13a1998-04-09 21:38:06 +00001272static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001273indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 if (tok->alterror) {
1276 tok->done = E_TABSPACE;
1277 tok->cur = tok->inp;
1278 return 1;
1279 }
1280 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001281#ifdef PGEN
1282 PySys_WriteStderr("inconsistent use of tabs and spaces "
1283 "in indentation\n");
1284#else
1285 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001287#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 tok->altwarning = 0;
1289 }
1290 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001291}
1292
Martin v. Löwis47383402007-08-15 07:32:56 +00001293#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001294#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001295#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296/* Verify that the identifier follows PEP 3131.
1297 All identifier strings are guaranteed to be "ready" unicode objects.
1298 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001299static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001300verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001301{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 PyObject *s;
1303 int result;
1304 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1307 PyErr_Clear();
1308 tok->done = E_IDENTIFIER;
1309 } else {
1310 tok->done = E_ERROR;
1311 }
1312 return 0;
1313 }
1314 result = PyUnicode_IsIdentifier(s);
1315 Py_DECREF(s);
1316 if (result == 0)
1317 tok->done = E_IDENTIFIER;
1318 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001319}
1320#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001321
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322/* Get next token, after space stripping etc. */
1323
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001324static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001325tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001327 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001329
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001331 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 tok->start = NULL;
1333 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001334
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 /* Get indentation level */
1336 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001337 int col = 0;
1338 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 tok->atbol = 0;
1340 for (;;) {
1341 c = tok_nextc(tok);
1342 if (c == ' ')
1343 col++, altcol++;
1344 else if (c == '\t') {
1345 col = (col/tok->tabsize + 1) * tok->tabsize;
1346 altcol = (altcol/tok->alttabsize + 1)
1347 * tok->alttabsize;
1348 }
1349 else if (c == '\014') /* Control-L (formfeed) */
1350 col = altcol = 0; /* For Emacs users */
1351 else
1352 break;
1353 }
1354 tok_backup(tok, c);
1355 if (c == '#' || c == '\n') {
1356 /* Lines with only whitespace and/or comments
1357 shouldn't affect the indentation and are
1358 not passed to the parser as NEWLINE tokens,
1359 except *totally* empty lines in interactive
1360 mode, which signal the end of a command group. */
1361 if (col == 0 && c == '\n' && tok->prompt != NULL)
1362 blankline = 0; /* Let it through */
1363 else
1364 blankline = 1; /* Ignore completely */
1365 /* We can't jump back right here since we still
1366 may need to skip to the end of a comment */
1367 }
1368 if (!blankline && tok->level == 0) {
1369 if (col == tok->indstack[tok->indent]) {
1370 /* No change */
1371 if (altcol != tok->altindstack[tok->indent]) {
1372 if (indenterror(tok))
1373 return ERRORTOKEN;
1374 }
1375 }
1376 else if (col > tok->indstack[tok->indent]) {
1377 /* Indent -- always one */
1378 if (tok->indent+1 >= MAXINDENT) {
1379 tok->done = E_TOODEEP;
1380 tok->cur = tok->inp;
1381 return ERRORTOKEN;
1382 }
1383 if (altcol <= tok->altindstack[tok->indent]) {
1384 if (indenterror(tok))
1385 return ERRORTOKEN;
1386 }
1387 tok->pendin++;
1388 tok->indstack[++tok->indent] = col;
1389 tok->altindstack[tok->indent] = altcol;
1390 }
1391 else /* col < tok->indstack[tok->indent] */ {
1392 /* Dedent -- any number, must be consistent */
1393 while (tok->indent > 0 &&
1394 col < tok->indstack[tok->indent]) {
1395 tok->pendin--;
1396 tok->indent--;
1397 }
1398 if (col != tok->indstack[tok->indent]) {
1399 tok->done = E_DEDENT;
1400 tok->cur = tok->inp;
1401 return ERRORTOKEN;
1402 }
1403 if (altcol != tok->altindstack[tok->indent]) {
1404 if (indenterror(tok))
1405 return ERRORTOKEN;
1406 }
1407 }
1408 }
1409 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 /* Return pending indents/dedents */
1414 if (tok->pendin != 0) {
1415 if (tok->pendin < 0) {
1416 tok->pendin++;
1417 return DEDENT;
1418 }
1419 else {
1420 tok->pendin--;
1421 return INDENT;
1422 }
1423 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426 tok->start = NULL;
1427 /* Skip spaces */
1428 do {
1429 c = tok_nextc(tok);
1430 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001431
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 /* Set start of current token */
1433 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 /* Skip comment */
1436 if (c == '#')
1437 while (c != EOF && c != '\n')
1438 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001439
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 /* Check for EOF and errors now */
1441 if (c == EOF) {
1442 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1443 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 /* Identifier (most frequent token!) */
1446 nonascii = 0;
1447 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001448 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001449 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001450 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001451 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001452 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001453 /* Since this is a backwards compatibility support literal we don't
1454 want to support it in arbitrary order like byte literals. */
1455 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1456 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001457 /* ur"" and ru"" are not supported */
1458 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001459 saw_r = 1;
1460 else
1461 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 c = tok_nextc(tok);
1463 if (c == '"' || c == '\'')
1464 goto letter_quote;
1465 }
1466 while (is_potential_identifier_char(c)) {
1467 if (c >= 128)
1468 nonascii = 1;
1469 c = tok_nextc(tok);
1470 }
1471 tok_backup(tok, c);
1472 if (nonascii &&
1473 !verify_identifier(tok)) {
1474 tok->done = E_IDENTIFIER;
1475 return ERRORTOKEN;
1476 }
1477 *p_start = tok->start;
1478 *p_end = tok->cur;
1479 return NAME;
1480 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001481
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 /* Newline */
1483 if (c == '\n') {
1484 tok->atbol = 1;
1485 if (blankline || tok->level > 0)
1486 goto nextline;
1487 *p_start = tok->start;
1488 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1489 tok->cont_line = 0;
1490 return NEWLINE;
1491 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001492
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 /* Period or number starting with period? */
1494 if (c == '.') {
1495 c = tok_nextc(tok);
1496 if (isdigit(c)) {
1497 goto fraction;
1498 } else if (c == '.') {
1499 c = tok_nextc(tok);
1500 if (c == '.') {
1501 *p_start = tok->start;
1502 *p_end = tok->cur;
1503 return ELLIPSIS;
1504 } else {
1505 tok_backup(tok, c);
1506 }
1507 tok_backup(tok, '.');
1508 } else {
1509 tok_backup(tok, c);
1510 }
1511 *p_start = tok->start;
1512 *p_end = tok->cur;
1513 return DOT;
1514 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001515
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001516 /* Number */
1517 if (isdigit(c)) {
1518 if (c == '0') {
1519 /* Hex, octal or binary -- maybe. */
1520 c = tok_nextc(tok);
1521 if (c == '.')
1522 goto fraction;
1523 if (c == 'j' || c == 'J')
1524 goto imaginary;
1525 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001526
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 /* Hex */
1528 c = tok_nextc(tok);
1529 if (!isxdigit(c)) {
1530 tok->done = E_TOKEN;
1531 tok_backup(tok, c);
1532 return ERRORTOKEN;
1533 }
1534 do {
1535 c = tok_nextc(tok);
1536 } while (isxdigit(c));
1537 }
1538 else if (c == 'o' || c == 'O') {
1539 /* Octal */
1540 c = tok_nextc(tok);
1541 if (c < '0' || c >= '8') {
1542 tok->done = E_TOKEN;
1543 tok_backup(tok, c);
1544 return ERRORTOKEN;
1545 }
1546 do {
1547 c = tok_nextc(tok);
1548 } while ('0' <= c && c < '8');
1549 }
1550 else if (c == 'b' || c == 'B') {
1551 /* Binary */
1552 c = tok_nextc(tok);
1553 if (c != '0' && c != '1') {
1554 tok->done = E_TOKEN;
1555 tok_backup(tok, c);
1556 return ERRORTOKEN;
1557 }
1558 do {
1559 c = tok_nextc(tok);
1560 } while (c == '0' || c == '1');
1561 }
1562 else {
1563 int nonzero = 0;
1564 /* maybe old-style octal; c is first char of it */
1565 /* in any case, allow '0' as a literal */
1566 while (c == '0')
1567 c = tok_nextc(tok);
1568 while (isdigit(c)) {
1569 nonzero = 1;
1570 c = tok_nextc(tok);
1571 }
1572 if (c == '.')
1573 goto fraction;
1574 else if (c == 'e' || c == 'E')
1575 goto exponent;
1576 else if (c == 'j' || c == 'J')
1577 goto imaginary;
1578 else if (nonzero) {
1579 tok->done = E_TOKEN;
1580 tok_backup(tok, c);
1581 return ERRORTOKEN;
1582 }
1583 }
1584 }
1585 else {
1586 /* Decimal */
1587 do {
1588 c = tok_nextc(tok);
1589 } while (isdigit(c));
1590 {
1591 /* Accept floating point numbers. */
1592 if (c == '.') {
1593 fraction:
1594 /* Fraction */
1595 do {
1596 c = tok_nextc(tok);
1597 } while (isdigit(c));
1598 }
1599 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001600 int e;
1601 exponent:
1602 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 /* Exponent part */
1604 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001605 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001607 if (!isdigit(c)) {
1608 tok->done = E_TOKEN;
1609 tok_backup(tok, c);
1610 return ERRORTOKEN;
1611 }
1612 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001614 tok_backup(tok, e);
1615 *p_start = tok->start;
1616 *p_end = tok->cur;
1617 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 }
1619 do {
1620 c = tok_nextc(tok);
1621 } while (isdigit(c));
1622 }
1623 if (c == 'j' || c == 'J')
1624 /* Imaginary part */
1625 imaginary:
1626 c = tok_nextc(tok);
1627 }
1628 }
1629 tok_backup(tok, c);
1630 *p_start = tok->start;
1631 *p_end = tok->cur;
1632 return NUMBER;
1633 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001634
1635 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 /* String */
1637 if (c == '\'' || c == '"') {
1638 int quote = c;
1639 int quote_size = 1; /* 1 or 3 */
1640 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001641
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 /* Find the quote size and start of string */
1643 c = tok_nextc(tok);
1644 if (c == quote) {
1645 c = tok_nextc(tok);
1646 if (c == quote)
1647 quote_size = 3;
1648 else
1649 end_quote_size = 1; /* empty string found */
1650 }
1651 if (c != quote)
1652 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001653
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Get rest of string */
1655 while (end_quote_size != quote_size) {
1656 c = tok_nextc(tok);
1657 if (c == EOF) {
1658 if (quote_size == 3)
1659 tok->done = E_EOFS;
1660 else
1661 tok->done = E_EOLS;
1662 tok->cur = tok->inp;
1663 return ERRORTOKEN;
1664 }
1665 if (quote_size == 1 && c == '\n') {
1666 tok->done = E_EOLS;
1667 tok->cur = tok->inp;
1668 return ERRORTOKEN;
1669 }
1670 if (c == quote)
1671 end_quote_size += 1;
1672 else {
1673 end_quote_size = 0;
1674 if (c == '\\')
1675 c = tok_nextc(tok); /* skip escaped char */
1676 }
1677 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001678
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 *p_start = tok->start;
1680 *p_end = tok->cur;
1681 return STRING;
1682 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001683
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 /* Line continuation */
1685 if (c == '\\') {
1686 c = tok_nextc(tok);
1687 if (c != '\n') {
1688 tok->done = E_LINECONT;
1689 tok->cur = tok->inp;
1690 return ERRORTOKEN;
1691 }
1692 tok->cont_line = 1;
1693 goto again; /* Read next line */
1694 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001695
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 /* Check for two-character token */
1697 {
1698 int c2 = tok_nextc(tok);
1699 int token = PyToken_TwoChars(c, c2);
1700 if (token != OP) {
1701 int c3 = tok_nextc(tok);
1702 int token3 = PyToken_ThreeChars(c, c2, c3);
1703 if (token3 != OP) {
1704 token = token3;
1705 } else {
1706 tok_backup(tok, c3);
1707 }
1708 *p_start = tok->start;
1709 *p_end = tok->cur;
1710 return token;
1711 }
1712 tok_backup(tok, c2);
1713 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001714
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 /* Keep track of parentheses nesting level */
1716 switch (c) {
1717 case '(':
1718 case '[':
1719 case '{':
1720 tok->level++;
1721 break;
1722 case ')':
1723 case ']':
1724 case '}':
1725 tok->level--;
1726 break;
1727 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001728
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 /* Punctuation character */
1730 *p_start = tok->start;
1731 *p_end = tok->cur;
1732 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001733}
1734
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001735int
1736PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1737{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 int result = tok_get(tok, p_start, p_end);
1739 if (tok->decoding_erred) {
1740 result = ERRORTOKEN;
1741 tok->done = E_DECODE;
1742 }
1743 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001744}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001745
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001746/* Get the encoding of a Python file. Check for the coding cookie and check if
1747 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001748
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001749 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1750 encoding in the first or second line of the file (in which case the encoding
1751 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001752
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001753 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1754 by the caller. */
1755
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001756char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001757PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001758{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 struct tok_state *tok;
1760 FILE *fp;
1761 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001762
Victor Stinnerdaf45552013-08-28 00:53:59 +02001763#ifndef PGEN
1764 fd = _Py_dup(fd);
1765#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001767#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 if (fd < 0) {
1769 return NULL;
1770 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001771
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 fp = fdopen(fd, "r");
1773 if (fp == NULL) {
1774 return NULL;
1775 }
1776 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1777 if (tok == NULL) {
1778 fclose(fp);
1779 return NULL;
1780 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001781#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001782 if (filename != NULL) {
1783 Py_INCREF(filename);
1784 tok->filename = filename;
1785 }
1786 else {
1787 tok->filename = PyUnicode_FromString("<string>");
1788 if (tok->filename == NULL) {
1789 fclose(fp);
1790 PyTokenizer_Free(tok);
1791 return encoding;
1792 }
1793 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001794#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001795 while (tok->lineno < 2 && tok->done == E_OK) {
1796 PyTokenizer_Get(tok, &p_start, &p_end);
1797 }
1798 fclose(fp);
1799 if (tok->encoding) {
1800 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1801 if (encoding)
1802 strcpy(encoding, tok->encoding);
1803 }
1804 PyTokenizer_Free(tok);
1805 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001806}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001807
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001808char *
1809PyTokenizer_FindEncoding(int fd)
1810{
1811 return PyTokenizer_FindEncodingFilename(fd, NULL);
1812}
1813
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by the
   token text [start, end) in parentheses for NAME, NUMBER, STRING and OP
   tokens. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif