blob: 5e041ea5b309dd322dde006164d9900ef16abfde [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C can start an identifier: an ASCII letter, an underscore,
   or any value >= 128.  The argument is fully parenthesized so that an
   expression may be passed, but it is still evaluated several times and
   must therefore be free of side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C can appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  The argument is fully parenthesized
   so that an expression may be passed, but it is still evaluated several
   times and must therefore be free of side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP",
    /* 53 */ "<ERRORTOKEN>",
    /* 54 */ "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: the string is not decoded; return a PyMem_MALLOC'ed copy
   of STR (NULL if new_string fails).  EXEC_INPUT is ignored. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the spellings of UTF-8 and Latin-1 to one canonical name.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Return the string literal "utf-8" or "iso-8859-1"
   when S names one of those encodings (optionally followed by a "-suffix"),
   otherwise return S itself.  Callers tell the two cases apart by comparing
   the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a non-ASCII byte in a signed char) is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200286 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200288 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200290 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 if (!get_coding_spec(line, &cs, size, tok))
292 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200293 if (!cs) {
294 Py_ssize_t i;
295 for (i = 0; i < size; i++) {
296 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
297 break;
298 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
299 /* Stop checking coding spec after a line containing
300 * anything except a comment. */
301 tok->read_coding_spec = 1;
302 break;
303 }
304 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200306 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700307 tok->read_coding_spec = 1;
308 if (tok->encoding == NULL) {
309 assert(tok->decoding_state == STATE_RAW);
310 if (strcmp(cs, "utf-8") == 0) {
311 tok->encoding = cs;
312 } else {
313 r = set_readline(tok, cs);
314 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300319 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700320 "encoding problem: %s", cs);
321 PyMem_FREE(cs);
322 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 } else { /* then, compare cs with BOM */
325 r = (strcmp(tok->encoding, cs) == 0);
326 if (!r)
327 PyErr_Format(PyExc_SyntaxError,
328 "encoding problem: %s with BOM", cs);
329 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332}
333
334/* See whether the file starts with a BOM. If it does,
335 invoke the set_readline function with the new encoding.
336 Return 1 on success, 0 on failure. */
337
338static int
339check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 void unget_char(int, struct tok_state *),
341 int set_readline(struct tok_state *, const char *),
342 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 int ch1, ch2, ch3;
345 ch1 = get_char(tok);
346 tok->decoding_state = STATE_RAW;
347 if (ch1 == EOF) {
348 return 1;
349 } else if (ch1 == 0xEF) {
350 ch2 = get_char(tok);
351 if (ch2 != 0xBB) {
352 unget_char(ch2, tok);
353 unget_char(ch1, tok);
354 return 1;
355 }
356 ch3 = get_char(tok);
357 if (ch3 != 0xBF) {
358 unget_char(ch3, tok);
359 unget_char(ch2, tok);
360 unget_char(ch1, tok);
361 return 1;
362 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Disable support for UTF-16 BOMs until a decision
365 is made whether this needs to be supported. */
366 } else if (ch1 == 0xFE) {
367 ch2 = get_char(tok);
368 if (ch2 != 0xFF) {
369 unget_char(ch2, tok);
370 unget_char(ch1, tok);
371 return 1;
372 }
373 if (!set_readline(tok, "utf-16-be"))
374 return 0;
375 tok->decoding_state = STATE_NORMAL;
376 } else if (ch1 == 0xFF) {
377 ch2 = get_char(tok);
378 if (ch2 != 0xFE) {
379 unget_char(ch2, tok);
380 unget_char(ch1, tok);
381 return 1;
382 }
383 if (!set_readline(tok, "utf-16-le"))
384 return 0;
385 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000386#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 } else {
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (tok->encoding != NULL)
392 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700393 tok->encoding = new_string("utf-8", 5, tok);
394 if (!tok->encoding)
395 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000396 /* No need to set_readline: input is already utf-8 */
397 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000398}
399
400/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000401 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000403 On entry, tok->decoding_buffer will be one of:
404 1) NULL: need to call tok->decoding_readline to get a new line
405 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000407 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 (in the s buffer) to copy entire contents of the line read
409 by tok->decoding_readline. tok->decoding_buffer has the overflow.
410 In this case, fp_readl is called in a loop (with an expanded buffer)
411 until the buffer ends with a '\n' (or until the end of the file is
412 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000413*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414
415static char *
416fp_readl(char *s, int size, struct tok_state *tok)
417{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 PyObject* bufobj;
419 const char *buf;
420 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 /* Ask for one less byte so we can terminate it */
423 assert(size > 0);
424 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 if (tok->decoding_buffer) {
427 bufobj = tok->decoding_buffer;
428 Py_INCREF(bufobj);
429 }
430 else
431 {
432 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
433 if (bufobj == NULL)
434 goto error;
435 }
436 if (PyUnicode_CheckExact(bufobj))
437 {
438 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
439 if (buf == NULL) {
440 goto error;
441 }
442 }
443 else
444 {
445 buf = PyByteArray_AsString(bufobj);
446 if (buf == NULL) {
447 goto error;
448 }
449 buflen = PyByteArray_GET_SIZE(bufobj);
450 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 Py_XDECREF(tok->decoding_buffer);
453 if (buflen > size) {
454 /* Too many chars, the rest goes into tok->decoding_buffer */
455 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
456 buflen-size);
457 if (tok->decoding_buffer == NULL)
458 goto error;
459 buflen = size;
460 }
461 else
462 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 memcpy(s, buf, buflen);
465 s[buflen] = '\0';
466 if (buflen == 0) /* EOF */
467 s = NULL;
468 Py_DECREF(bufobj);
469 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000470
471error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 Py_XDECREF(bufobj);
473 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Set the readline function for TOK to a StreamReader's
477 readline function. The StreamReader is named ENC.
478
479 This function is called from check_bom and check_coding_spec.
480
481 ENC is usually identical to the future value of tok->encoding,
482 except for the (currently unsupported) case of UTF-16.
483
484 Return 1 on success, 0 on failure. */
485
486static int
487fp_setreadl(struct tok_state *tok, const char* enc)
488{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(open);
491 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000492 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200493 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 io = PyImport_ImportModuleNoBlock("io");
496 if (io == NULL)
497 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Victor Stinner22a351a2010-10-14 12:04:34 +0000499 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200500 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100501 * position of tok->fp. If tok->fp was opened in text mode on Windows,
502 * its file position counts CRLF as one char and can't be directly mapped
503 * to the file offset for fd. Instead we step back one byte and read to
504 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200505 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100506 if (pos == -1 ||
507 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000508 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
509 goto cleanup;
510 }
511
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200512 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000513 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514 if (stream == NULL)
515 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000517 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200518 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000519 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100520 if (pos > 0) {
521 if (PyObject_CallObject(readline, NULL) == NULL) {
522 readline = NULL;
523 goto cleanup;
524 }
525 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000526
527 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000528 Py_XDECREF(stream);
529 Py_XDECREF(io);
530 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531}
532
533/* Fetch the next byte from TOK. */
534
535static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537}
538
539/* Unfetch the last byte back into TOK. */
540
541static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000545/* Check whether the characters at s start a valid
546 UTF-8 sequence. Return the number of characters forming
547 the sequence if yes, 0 if not. */
/* S must point into a NUL-terminated byte string.  Only the lead byte and
   the continuation-byte pattern are checked; overlong forms and code point
   ranges are not validated. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Check the continuation bytes front to back.  A NUL terminator is
       < 0x80, so a truncated sequence is rejected at the terminator and
       no byte beyond it is ever read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
572
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573/* Read a line of input from TOK. Determine encoding
574 if necessary. */
575
576static char *
577decoding_fgets(char *s, int size, struct tok_state *tok)
578{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 char *line = NULL;
580 int badchar = 0;
581 for (;;) {
582 if (tok->decoding_state == STATE_NORMAL) {
583 /* We already have a codec associated with
584 this input. */
585 line = fp_readl(s, size, tok);
586 break;
587 } else if (tok->decoding_state == STATE_RAW) {
588 /* We want a 'raw' read. */
589 line = Py_UniversalNewlineFgets(s, size,
590 tok->fp, NULL);
591 break;
592 } else {
593 /* We have not yet determined the encoding.
594 If an encoding is found, use the file-pointer
595 reader functions from now on. */
596 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
597 return error_ret(tok);
598 assert(tok->decoding_state != STATE_INIT);
599 }
600 }
601 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
602 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
603 return error_ret(tok);
604 }
605 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 /* The default encoding is UTF-8, so make sure we don't have any
608 non-UTF-8 sequences in it. */
609 if (line && !tok->encoding) {
610 unsigned char *c;
611 int length;
612 for (c = (unsigned char *)line; *c; c += length)
613 if (!(length = valid_utf8(c))) {
614 badchar = *c;
615 break;
616 }
617 }
618 if (badchar) {
619 /* Need to add 1 to the line number, since this line
620 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200621 PyErr_Format(PyExc_SyntaxError,
622 "Non-UTF-8 code starting with '\\x%.2x' "
623 "in file %U on line %i, "
624 "but no encoding declared; "
625 "see http://python.org/dev/peps/pep-0263/ for details",
626 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 return error_ret(tok);
628 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000629#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633static int
634decoding_feof(struct tok_state *tok)
635{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 if (tok->decoding_state != STATE_NORMAL) {
637 return feof(tok->fp);
638 } else {
639 PyObject* buf = tok->decoding_buffer;
640 if (buf == NULL) {
641 buf = PyObject_CallObject(tok->decoding_readline, NULL);
642 if (buf == NULL) {
643 error_ret(tok);
644 return 1;
645 } else {
646 tok->decoding_buffer = buf;
647 }
648 }
649 return PyObject_Length(buf) == 0;
650 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651}
652
653/* Fetch a byte from TOK, using the string buffer. */
654
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000655static int
656buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658}
659
660/* Unfetch a byte from TOK, using the string buffer. */
661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000662static void
663buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 tok->str--;
665 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666}
667
668/* Set the readline function for TOK to ENC. For the string-based
669 tokenizer, this means to just record the encoding. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static int
672buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->enc = enc;
674 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Return a UTF-8 encoding Python string object from the
678 C byte string STR, which is encoded with ENC. */
679
680static PyObject *
681translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 PyObject *utf8;
683 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
684 if (buf == NULL)
685 return NULL;
686 utf8 = PyUnicode_AsUTF8String(buf);
687 Py_DECREF(buf);
688 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000689}
690
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000691
692static char *
693translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200694 int skip_next_lf = 0;
695 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 char *buf, *current;
697 char c = '\0';
698 buf = PyMem_MALLOC(needed_length);
699 if (buf == NULL) {
700 tok->done = E_NOMEM;
701 return NULL;
702 }
703 for (current = buf; *s; s++, current++) {
704 c = *s;
705 if (skip_next_lf) {
706 skip_next_lf = 0;
707 if (c == '\n') {
708 c = *++s;
709 if (!c)
710 break;
711 }
712 }
713 if (c == '\r') {
714 skip_next_lf = 1;
715 c = '\n';
716 }
717 *current = c;
718 }
719 /* If this is exec input, add a newline to the end of the string if
720 there isn't one already. */
721 if (exec_input && c != '\n') {
722 *current = '\n';
723 current++;
724 }
725 *current = '\0';
726 final_length = current - buf + 1;
727 if (final_length < needed_length && final_length)
728 /* should never fail */
729 buf = PyMem_REALLOC(buf, final_length);
730 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000731}
732
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000733/* Decode a byte string STR for use as the buffer of TOK.
734 Look for encoding declarations inside STR, and record them
735 inside TOK. */
736
737static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000738decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000739{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 PyObject* utf8 = NULL;
741 const char *str;
742 const char *s;
743 const char *newl[2] = {NULL, NULL};
744 int lineno = 0;
745 tok->input = str = translate_newlines(input, single, tok);
746 if (str == NULL)
747 return NULL;
748 tok->enc = NULL;
749 tok->str = str;
750 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
751 return error_ret(tok);
752 str = tok->str; /* string after BOM if any */
753 assert(str);
754 if (tok->enc != NULL) {
755 utf8 = translate_into_utf8(str, tok->enc);
756 if (utf8 == NULL)
757 return error_ret(tok);
758 str = PyBytes_AsString(utf8);
759 }
760 for (s = str;; s++) {
761 if (*s == '\0') break;
762 else if (*s == '\n') {
763 assert(lineno < 2);
764 newl[lineno] = s;
765 lineno++;
766 if (lineno == 2) break;
767 }
768 }
769 tok->enc = NULL;
770 /* need to check line 1 and 2 separately since check_coding_spec
771 assumes a single line as input */
772 if (newl[0]) {
773 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
774 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200775 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
777 tok, buf_setreadl))
778 return error_ret(tok);
779 }
780 }
781 if (tok->enc != NULL) {
782 assert(utf8 == NULL);
783 utf8 = translate_into_utf8(str, tok->enc);
784 if (utf8 == NULL)
785 return error_ret(tok);
786 str = PyBytes_AS_STRING(utf8);
787 }
788 assert(tok->decoding_buffer == NULL);
789 tok->decoding_buffer = utf8; /* CAUTION */
790 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000791}
792
793#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000794
795/* Set up tokenizer for string */
796
797struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000798PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000799{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 struct tok_state *tok = tok_new();
801 if (tok == NULL)
802 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300803 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 if (str == NULL) {
805 PyTokenizer_Free(tok);
806 return NULL;
807 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000808
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 /* XXX: constify members. */
810 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
811 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812}
813
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000814struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000815PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000816{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 struct tok_state *tok = tok_new();
818 if (tok == NULL)
819 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000820#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000822#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 if (str == NULL) {
824 PyTokenizer_Free(tok);
825 return NULL;
826 }
827 tok->decoding_state = STATE_RAW;
828 tok->read_coding_spec = 1;
829 tok->enc = NULL;
830 tok->str = str;
831 tok->encoding = (char *)PyMem_MALLOC(6);
832 if (!tok->encoding) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000837
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 /* XXX: constify members. */
839 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
840 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000841}
842
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000843/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000844
845struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300846PyTokenizer_FromFile(FILE *fp, const char* enc,
847 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000848{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 struct tok_state *tok = tok_new();
850 if (tok == NULL)
851 return NULL;
852 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
853 PyTokenizer_Free(tok);
854 return NULL;
855 }
856 tok->cur = tok->inp = tok->buf;
857 tok->end = tok->buf + BUFSIZ;
858 tok->fp = fp;
859 tok->prompt = ps1;
860 tok->nextprompt = ps2;
861 if (enc != NULL) {
862 /* Must copy encoding declaration since it
863 gets copied into the parse tree. */
864 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
865 if (!tok->encoding) {
866 PyTokenizer_Free(tok);
867 return NULL;
868 }
869 strcpy(tok->encoding, enc);
870 tok->decoding_state = STATE_NORMAL;
871 }
872 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873}
874
875
876/* Free a tok_state structure */
877
878void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000879PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000880{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000881 if (tok->encoding != NULL)
882 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000883#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000884 Py_XDECREF(tok->decoding_readline);
885 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200886 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000887#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 if (tok->fp != NULL && tok->buf != NULL)
889 PyMem_FREE(tok->buf);
890 if (tok->input)
891 PyMem_FREE((char *)tok->input);
892 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000893}
894
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000895/* Get next char, updating state; error code goes into tok->done */
896
897static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200898tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000899{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 for (;;) {
901 if (tok->cur != tok->inp) {
902 return Py_CHARMASK(*tok->cur++); /* Fast path */
903 }
904 if (tok->done != E_OK)
905 return EOF;
906 if (tok->fp == NULL) {
907 char *end = strchr(tok->inp, '\n');
908 if (end != NULL)
909 end++;
910 else {
911 end = strchr(tok->inp, '\0');
912 if (end == tok->inp) {
913 tok->done = E_EOF;
914 return EOF;
915 }
916 }
917 if (tok->start == NULL)
918 tok->buf = tok->cur;
919 tok->line_start = tok->cur;
920 tok->lineno++;
921 tok->inp = end;
922 return Py_CHARMASK(*tok->cur++);
923 }
924 if (tok->prompt != NULL) {
925 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000926#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000927 if (newtok != NULL) {
928 char *translated = translate_newlines(newtok, 0, tok);
929 PyMem_FREE(newtok);
930 if (translated == NULL)
931 return EOF;
932 newtok = translated;
933 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000934 if (tok->encoding && newtok && *newtok) {
935 /* Recode to UTF-8 */
936 Py_ssize_t buflen;
937 const char* buf;
938 PyObject *u = translate_into_utf8(newtok, tok->encoding);
939 PyMem_FREE(newtok);
940 if (!u) {
941 tok->done = E_DECODE;
942 return EOF;
943 }
944 buflen = PyBytes_GET_SIZE(u);
945 buf = PyBytes_AS_STRING(u);
946 if (!buf) {
947 Py_DECREF(u);
948 tok->done = E_DECODE;
949 return EOF;
950 }
951 newtok = PyMem_MALLOC(buflen+1);
952 strcpy(newtok, buf);
953 Py_DECREF(u);
954 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000955#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000956 if (tok->nextprompt != NULL)
957 tok->prompt = tok->nextprompt;
958 if (newtok == NULL)
959 tok->done = E_INTR;
960 else if (*newtok == '\0') {
961 PyMem_FREE(newtok);
962 tok->done = E_EOF;
963 }
964 else if (tok->start != NULL) {
965 size_t start = tok->start - tok->buf;
966 size_t oldlen = tok->cur - tok->buf;
967 size_t newlen = oldlen + strlen(newtok);
968 char *buf = tok->buf;
969 buf = (char *)PyMem_REALLOC(buf, newlen+1);
970 tok->lineno++;
971 if (buf == NULL) {
972 PyMem_FREE(tok->buf);
973 tok->buf = NULL;
974 PyMem_FREE(newtok);
975 tok->done = E_NOMEM;
976 return EOF;
977 }
978 tok->buf = buf;
979 tok->cur = tok->buf + oldlen;
980 tok->line_start = tok->cur;
981 strcpy(tok->buf + oldlen, newtok);
982 PyMem_FREE(newtok);
983 tok->inp = tok->buf + newlen;
984 tok->end = tok->inp + 1;
985 tok->start = tok->buf + start;
986 }
987 else {
988 tok->lineno++;
989 if (tok->buf != NULL)
990 PyMem_FREE(tok->buf);
991 tok->buf = newtok;
992 tok->line_start = tok->buf;
993 tok->cur = tok->buf;
994 tok->line_start = tok->buf;
995 tok->inp = strchr(tok->buf, '\0');
996 tok->end = tok->inp + 1;
997 }
998 }
999 else {
1000 int done = 0;
1001 Py_ssize_t cur = 0;
1002 char *pt;
1003 if (tok->start == NULL) {
1004 if (tok->buf == NULL) {
1005 tok->buf = (char *)
1006 PyMem_MALLOC(BUFSIZ);
1007 if (tok->buf == NULL) {
1008 tok->done = E_NOMEM;
1009 return EOF;
1010 }
1011 tok->end = tok->buf + BUFSIZ;
1012 }
1013 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1014 tok) == NULL) {
1015 tok->done = E_EOF;
1016 done = 1;
1017 }
1018 else {
1019 tok->done = E_OK;
1020 tok->inp = strchr(tok->buf, '\0');
1021 done = tok->inp[-1] == '\n';
1022 }
1023 }
1024 else {
1025 cur = tok->cur - tok->buf;
1026 if (decoding_feof(tok)) {
1027 tok->done = E_EOF;
1028 done = 1;
1029 }
1030 else
1031 tok->done = E_OK;
1032 }
1033 tok->lineno++;
1034 /* Read until '\n' or EOF */
1035 while (!done) {
1036 Py_ssize_t curstart = tok->start == NULL ? -1 :
1037 tok->start - tok->buf;
1038 Py_ssize_t curvalid = tok->inp - tok->buf;
1039 Py_ssize_t newsize = curvalid + BUFSIZ;
1040 char *newbuf = tok->buf;
1041 newbuf = (char *)PyMem_REALLOC(newbuf,
1042 newsize);
1043 if (newbuf == NULL) {
1044 tok->done = E_NOMEM;
1045 tok->cur = tok->inp;
1046 return EOF;
1047 }
1048 tok->buf = newbuf;
1049 tok->inp = tok->buf + curvalid;
1050 tok->end = tok->buf + newsize;
1051 tok->start = curstart < 0 ? NULL :
1052 tok->buf + curstart;
1053 if (decoding_fgets(tok->inp,
1054 (int)(tok->end - tok->inp),
1055 tok) == NULL) {
1056 /* Break out early on decoding
1057 errors, as tok->buf will be NULL
1058 */
1059 if (tok->decoding_erred)
1060 return EOF;
1061 /* Last line does not end in \n,
1062 fake one */
1063 strcpy(tok->inp, "\n");
1064 }
1065 tok->inp = strchr(tok->inp, '\0');
1066 done = tok->inp[-1] == '\n';
1067 }
1068 if (tok->buf != NULL) {
1069 tok->cur = tok->buf + cur;
1070 tok->line_start = tok->cur;
1071 /* replace "\r\n" with "\n" */
1072 /* For Mac leave the \r, giving a syntax error */
1073 pt = tok->inp - 2;
1074 if (pt >= tok->buf && *pt == '\r') {
1075 *pt++ = '\n';
1076 *pt = '\0';
1077 tok->inp = pt;
1078 }
1079 }
1080 }
1081 if (tok->done != E_OK) {
1082 if (tok->prompt != NULL)
1083 PySys_WriteStderr("\n");
1084 tok->cur = tok->inp;
1085 return EOF;
1086 }
1087 }
1088 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089}
1090
1091
1092/* Back-up one character */
1093
1094static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001095tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001096{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 if (c != EOF) {
1098 if (--tok->cur < tok->buf)
1099 Py_FatalError("tok_backup: beginning of buffer");
1100 if (*tok->cur != c)
1101 *tok->cur = c;
1102 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103}
1104
1105
1106/* Return the token corresponding to a single character */
1107
1108int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001109PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001110{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 switch (c) {
1112 case '(': return LPAR;
1113 case ')': return RPAR;
1114 case '[': return LSQB;
1115 case ']': return RSQB;
1116 case ':': return COLON;
1117 case ',': return COMMA;
1118 case ';': return SEMI;
1119 case '+': return PLUS;
1120 case '-': return MINUS;
1121 case '*': return STAR;
1122 case '/': return SLASH;
1123 case '|': return VBAR;
1124 case '&': return AMPER;
1125 case '<': return LESS;
1126 case '>': return GREATER;
1127 case '=': return EQUAL;
1128 case '.': return DOT;
1129 case '%': return PERCENT;
1130 case '{': return LBRACE;
1131 case '}': return RBRACE;
1132 case '^': return CIRCUMFLEX;
1133 case '~': return TILDE;
1134 case '@': return AT;
1135 default: return OP;
1136 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137}
1138
1139
Guido van Rossumfbab9051991-10-20 20:25:03 +00001140int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001141PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001142{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 switch (c1) {
1144 case '=':
1145 switch (c2) {
1146 case '=': return EQEQUAL;
1147 }
1148 break;
1149 case '!':
1150 switch (c2) {
1151 case '=': return NOTEQUAL;
1152 }
1153 break;
1154 case '<':
1155 switch (c2) {
1156 case '>': return NOTEQUAL;
1157 case '=': return LESSEQUAL;
1158 case '<': return LEFTSHIFT;
1159 }
1160 break;
1161 case '>':
1162 switch (c2) {
1163 case '=': return GREATEREQUAL;
1164 case '>': return RIGHTSHIFT;
1165 }
1166 break;
1167 case '+':
1168 switch (c2) {
1169 case '=': return PLUSEQUAL;
1170 }
1171 break;
1172 case '-':
1173 switch (c2) {
1174 case '=': return MINEQUAL;
1175 case '>': return RARROW;
1176 }
1177 break;
1178 case '*':
1179 switch (c2) {
1180 case '*': return DOUBLESTAR;
1181 case '=': return STAREQUAL;
1182 }
1183 break;
1184 case '/':
1185 switch (c2) {
1186 case '/': return DOUBLESLASH;
1187 case '=': return SLASHEQUAL;
1188 }
1189 break;
1190 case '|':
1191 switch (c2) {
1192 case '=': return VBAREQUAL;
1193 }
1194 break;
1195 case '%':
1196 switch (c2) {
1197 case '=': return PERCENTEQUAL;
1198 }
1199 break;
1200 case '&':
1201 switch (c2) {
1202 case '=': return AMPEREQUAL;
1203 }
1204 break;
1205 case '^':
1206 switch (c2) {
1207 case '=': return CIRCUMFLEXEQUAL;
1208 }
1209 break;
1210 }
1211 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001212}
1213
Thomas Wouters434d0822000-08-24 20:11:32 +00001214int
1215PyToken_ThreeChars(int c1, int c2, int c3)
1216{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 switch (c1) {
1218 case '<':
1219 switch (c2) {
1220 case '<':
1221 switch (c3) {
1222 case '=':
1223 return LEFTSHIFTEQUAL;
1224 }
1225 break;
1226 }
1227 break;
1228 case '>':
1229 switch (c2) {
1230 case '>':
1231 switch (c3) {
1232 case '=':
1233 return RIGHTSHIFTEQUAL;
1234 }
1235 break;
1236 }
1237 break;
1238 case '*':
1239 switch (c2) {
1240 case '*':
1241 switch (c3) {
1242 case '=':
1243 return DOUBLESTAREQUAL;
1244 }
1245 break;
1246 }
1247 break;
1248 case '/':
1249 switch (c2) {
1250 case '/':
1251 switch (c3) {
1252 case '=':
1253 return DOUBLESLASHEQUAL;
1254 }
1255 break;
1256 }
1257 break;
1258 case '.':
1259 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001260 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001261 switch (c3) {
1262 case '.':
1263 return ELLIPSIS;
1264 }
1265 break;
1266 }
1267 break;
1268 }
1269 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001270}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001271
Guido van Rossum926f13a1998-04-09 21:38:06 +00001272static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001273indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 if (tok->alterror) {
1276 tok->done = E_TABSPACE;
1277 tok->cur = tok->inp;
1278 return 1;
1279 }
1280 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001281#ifdef PGEN
1282 PySys_WriteStderr("inconsistent use of tabs and spaces "
1283 "in indentation\n");
1284#else
1285 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001287#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 tok->altwarning = 0;
1289 }
1290 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001291}
1292
Martin v. Löwis47383402007-08-15 07:32:56 +00001293#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001294#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001295#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296/* Verify that the identifier follows PEP 3131.
1297 All identifier strings are guaranteed to be "ready" unicode objects.
1298 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001299static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001300verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001301{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 PyObject *s;
1303 int result;
Benjamin Petersond73aca72015-04-21 12:05:19 -04001304 if (tok->decoding_erred)
1305 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1309 PyErr_Clear();
1310 tok->done = E_IDENTIFIER;
1311 } else {
1312 tok->done = E_ERROR;
1313 }
1314 return 0;
1315 }
1316 result = PyUnicode_IsIdentifier(s);
1317 Py_DECREF(s);
1318 if (result == 0)
1319 tok->done = E_IDENTIFIER;
1320 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001321}
1322#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001323
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324/* Get next token, after space stripping etc. */
1325
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001326static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001327tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001329 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001331
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001333 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 tok->start = NULL;
1335 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001336
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 /* Get indentation level */
1338 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001339 int col = 0;
1340 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 tok->atbol = 0;
1342 for (;;) {
1343 c = tok_nextc(tok);
1344 if (c == ' ')
1345 col++, altcol++;
1346 else if (c == '\t') {
1347 col = (col/tok->tabsize + 1) * tok->tabsize;
1348 altcol = (altcol/tok->alttabsize + 1)
1349 * tok->alttabsize;
1350 }
1351 else if (c == '\014') /* Control-L (formfeed) */
1352 col = altcol = 0; /* For Emacs users */
1353 else
1354 break;
1355 }
1356 tok_backup(tok, c);
1357 if (c == '#' || c == '\n') {
1358 /* Lines with only whitespace and/or comments
1359 shouldn't affect the indentation and are
1360 not passed to the parser as NEWLINE tokens,
1361 except *totally* empty lines in interactive
1362 mode, which signal the end of a command group. */
1363 if (col == 0 && c == '\n' && tok->prompt != NULL)
1364 blankline = 0; /* Let it through */
1365 else
1366 blankline = 1; /* Ignore completely */
1367 /* We can't jump back right here since we still
1368 may need to skip to the end of a comment */
1369 }
1370 if (!blankline && tok->level == 0) {
1371 if (col == tok->indstack[tok->indent]) {
1372 /* No change */
1373 if (altcol != tok->altindstack[tok->indent]) {
1374 if (indenterror(tok))
1375 return ERRORTOKEN;
1376 }
1377 }
1378 else if (col > tok->indstack[tok->indent]) {
1379 /* Indent -- always one */
1380 if (tok->indent+1 >= MAXINDENT) {
1381 tok->done = E_TOODEEP;
1382 tok->cur = tok->inp;
1383 return ERRORTOKEN;
1384 }
1385 if (altcol <= tok->altindstack[tok->indent]) {
1386 if (indenterror(tok))
1387 return ERRORTOKEN;
1388 }
1389 tok->pendin++;
1390 tok->indstack[++tok->indent] = col;
1391 tok->altindstack[tok->indent] = altcol;
1392 }
1393 else /* col < tok->indstack[tok->indent] */ {
1394 /* Dedent -- any number, must be consistent */
1395 while (tok->indent > 0 &&
1396 col < tok->indstack[tok->indent]) {
1397 tok->pendin--;
1398 tok->indent--;
1399 }
1400 if (col != tok->indstack[tok->indent]) {
1401 tok->done = E_DEDENT;
1402 tok->cur = tok->inp;
1403 return ERRORTOKEN;
1404 }
1405 if (altcol != tok->altindstack[tok->indent]) {
1406 if (indenterror(tok))
1407 return ERRORTOKEN;
1408 }
1409 }
1410 }
1411 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 /* Return pending indents/dedents */
1416 if (tok->pendin != 0) {
1417 if (tok->pendin < 0) {
1418 tok->pendin++;
1419 return DEDENT;
1420 }
1421 else {
1422 tok->pendin--;
1423 return INDENT;
1424 }
1425 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001426
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001427 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 tok->start = NULL;
1429 /* Skip spaces */
1430 do {
1431 c = tok_nextc(tok);
1432 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001433
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001434 /* Set start of current token */
1435 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001436
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 /* Skip comment */
1438 if (c == '#')
1439 while (c != EOF && c != '\n')
1440 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001441
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 /* Check for EOF and errors now */
1443 if (c == EOF) {
1444 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1445 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001446
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 /* Identifier (most frequent token!) */
1448 nonascii = 0;
1449 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001450 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001451 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001452 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001453 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001454 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001455 /* Since this is a backwards compatibility support literal we don't
1456 want to support it in arbitrary order like byte literals. */
1457 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1458 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001459 /* ur"" and ru"" are not supported */
1460 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001461 saw_r = 1;
1462 else
1463 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 c = tok_nextc(tok);
1465 if (c == '"' || c == '\'')
1466 goto letter_quote;
1467 }
1468 while (is_potential_identifier_char(c)) {
1469 if (c >= 128)
1470 nonascii = 1;
1471 c = tok_nextc(tok);
1472 }
1473 tok_backup(tok, c);
Benjamin Petersond73aca72015-04-21 12:05:19 -04001474 if (nonascii && !verify_identifier(tok))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001475 return ERRORTOKEN;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001476 *p_start = tok->start;
1477 *p_end = tok->cur;
1478 return NAME;
1479 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001480
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 /* Newline */
1482 if (c == '\n') {
1483 tok->atbol = 1;
1484 if (blankline || tok->level > 0)
1485 goto nextline;
1486 *p_start = tok->start;
1487 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1488 tok->cont_line = 0;
1489 return NEWLINE;
1490 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001491
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001492 /* Period or number starting with period? */
1493 if (c == '.') {
1494 c = tok_nextc(tok);
1495 if (isdigit(c)) {
1496 goto fraction;
1497 } else if (c == '.') {
1498 c = tok_nextc(tok);
1499 if (c == '.') {
1500 *p_start = tok->start;
1501 *p_end = tok->cur;
1502 return ELLIPSIS;
1503 } else {
1504 tok_backup(tok, c);
1505 }
1506 tok_backup(tok, '.');
1507 } else {
1508 tok_backup(tok, c);
1509 }
1510 *p_start = tok->start;
1511 *p_end = tok->cur;
1512 return DOT;
1513 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001514
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 /* Number */
1516 if (isdigit(c)) {
1517 if (c == '0') {
1518 /* Hex, octal or binary -- maybe. */
1519 c = tok_nextc(tok);
1520 if (c == '.')
1521 goto fraction;
1522 if (c == 'j' || c == 'J')
1523 goto imaginary;
1524 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001525
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001526 /* Hex */
1527 c = tok_nextc(tok);
1528 if (!isxdigit(c)) {
1529 tok->done = E_TOKEN;
1530 tok_backup(tok, c);
1531 return ERRORTOKEN;
1532 }
1533 do {
1534 c = tok_nextc(tok);
1535 } while (isxdigit(c));
1536 }
1537 else if (c == 'o' || c == 'O') {
1538 /* Octal */
1539 c = tok_nextc(tok);
1540 if (c < '0' || c >= '8') {
1541 tok->done = E_TOKEN;
1542 tok_backup(tok, c);
1543 return ERRORTOKEN;
1544 }
1545 do {
1546 c = tok_nextc(tok);
1547 } while ('0' <= c && c < '8');
1548 }
1549 else if (c == 'b' || c == 'B') {
1550 /* Binary */
1551 c = tok_nextc(tok);
1552 if (c != '0' && c != '1') {
1553 tok->done = E_TOKEN;
1554 tok_backup(tok, c);
1555 return ERRORTOKEN;
1556 }
1557 do {
1558 c = tok_nextc(tok);
1559 } while (c == '0' || c == '1');
1560 }
1561 else {
1562 int nonzero = 0;
1563 /* maybe old-style octal; c is first char of it */
1564 /* in any case, allow '0' as a literal */
1565 while (c == '0')
1566 c = tok_nextc(tok);
1567 while (isdigit(c)) {
1568 nonzero = 1;
1569 c = tok_nextc(tok);
1570 }
1571 if (c == '.')
1572 goto fraction;
1573 else if (c == 'e' || c == 'E')
1574 goto exponent;
1575 else if (c == 'j' || c == 'J')
1576 goto imaginary;
1577 else if (nonzero) {
1578 tok->done = E_TOKEN;
1579 tok_backup(tok, c);
1580 return ERRORTOKEN;
1581 }
1582 }
1583 }
1584 else {
1585 /* Decimal */
1586 do {
1587 c = tok_nextc(tok);
1588 } while (isdigit(c));
1589 {
1590 /* Accept floating point numbers. */
1591 if (c == '.') {
1592 fraction:
1593 /* Fraction */
1594 do {
1595 c = tok_nextc(tok);
1596 } while (isdigit(c));
1597 }
1598 if (c == 'e' || c == 'E') {
Benjamin Petersonc4161622014-06-07 12:36:39 -07001599 int e;
1600 exponent:
1601 e = c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001602 /* Exponent part */
1603 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001604 if (c == '+' || c == '-') {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001605 c = tok_nextc(tok);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001606 if (!isdigit(c)) {
1607 tok->done = E_TOKEN;
1608 tok_backup(tok, c);
1609 return ERRORTOKEN;
1610 }
1611 } else if (!isdigit(c)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 tok_backup(tok, c);
Benjamin Petersonc4161622014-06-07 12:36:39 -07001613 tok_backup(tok, e);
1614 *p_start = tok->start;
1615 *p_end = tok->cur;
1616 return NUMBER;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 }
1618 do {
1619 c = tok_nextc(tok);
1620 } while (isdigit(c));
1621 }
1622 if (c == 'j' || c == 'J')
1623 /* Imaginary part */
1624 imaginary:
1625 c = tok_nextc(tok);
1626 }
1627 }
1628 tok_backup(tok, c);
1629 *p_start = tok->start;
1630 *p_end = tok->cur;
1631 return NUMBER;
1632 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001633
1634 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 /* String */
1636 if (c == '\'' || c == '"') {
1637 int quote = c;
1638 int quote_size = 1; /* 1 or 3 */
1639 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001640
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 /* Find the quote size and start of string */
1642 c = tok_nextc(tok);
1643 if (c == quote) {
1644 c = tok_nextc(tok);
1645 if (c == quote)
1646 quote_size = 3;
1647 else
1648 end_quote_size = 1; /* empty string found */
1649 }
1650 if (c != quote)
1651 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001652
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 /* Get rest of string */
1654 while (end_quote_size != quote_size) {
1655 c = tok_nextc(tok);
1656 if (c == EOF) {
1657 if (quote_size == 3)
1658 tok->done = E_EOFS;
1659 else
1660 tok->done = E_EOLS;
1661 tok->cur = tok->inp;
1662 return ERRORTOKEN;
1663 }
1664 if (quote_size == 1 && c == '\n') {
1665 tok->done = E_EOLS;
1666 tok->cur = tok->inp;
1667 return ERRORTOKEN;
1668 }
1669 if (c == quote)
1670 end_quote_size += 1;
1671 else {
1672 end_quote_size = 0;
1673 if (c == '\\')
1674 c = tok_nextc(tok); /* skip escaped char */
1675 }
1676 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001677
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 *p_start = tok->start;
1679 *p_end = tok->cur;
1680 return STRING;
1681 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001682
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683 /* Line continuation */
1684 if (c == '\\') {
1685 c = tok_nextc(tok);
1686 if (c != '\n') {
1687 tok->done = E_LINECONT;
1688 tok->cur = tok->inp;
1689 return ERRORTOKEN;
1690 }
1691 tok->cont_line = 1;
1692 goto again; /* Read next line */
1693 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001694
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 /* Check for two-character token */
1696 {
1697 int c2 = tok_nextc(tok);
1698 int token = PyToken_TwoChars(c, c2);
1699 if (token != OP) {
1700 int c3 = tok_nextc(tok);
1701 int token3 = PyToken_ThreeChars(c, c2, c3);
1702 if (token3 != OP) {
1703 token = token3;
1704 } else {
1705 tok_backup(tok, c3);
1706 }
1707 *p_start = tok->start;
1708 *p_end = tok->cur;
1709 return token;
1710 }
1711 tok_backup(tok, c2);
1712 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001713
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 /* Keep track of parentheses nesting level */
1715 switch (c) {
1716 case '(':
1717 case '[':
1718 case '{':
1719 tok->level++;
1720 break;
1721 case ')':
1722 case ']':
1723 case '}':
1724 tok->level--;
1725 break;
1726 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001727
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 /* Punctuation character */
1729 *p_start = tok->start;
1730 *p_end = tok->cur;
1731 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001732}
1733
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001734int
1735PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1736{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 int result = tok_get(tok, p_start, p_end);
1738 if (tok->decoding_erred) {
1739 result = ERRORTOKEN;
1740 tok->done = E_DECODE;
1741 }
1742 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001743}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001744
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001745/* Get the encoding of a Python file. Check for the coding cookie and check if
1746 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001747
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001748 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1749 encoding in the first or second line of the file (in which case the encoding
1750 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001751
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001752 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1753 by the caller. */
1754
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001755char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001756PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001757{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 struct tok_state *tok;
1759 FILE *fp;
1760 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001761
Victor Stinnerdaf45552013-08-28 00:53:59 +02001762#ifndef PGEN
1763 fd = _Py_dup(fd);
1764#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001766#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001767 if (fd < 0) {
1768 return NULL;
1769 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001770
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001771 fp = fdopen(fd, "r");
1772 if (fp == NULL) {
1773 return NULL;
1774 }
1775 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1776 if (tok == NULL) {
1777 fclose(fp);
1778 return NULL;
1779 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001780#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001781 if (filename != NULL) {
1782 Py_INCREF(filename);
1783 tok->filename = filename;
1784 }
1785 else {
1786 tok->filename = PyUnicode_FromString("<string>");
1787 if (tok->filename == NULL) {
1788 fclose(fp);
1789 PyTokenizer_Free(tok);
1790 return encoding;
1791 }
1792 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001793#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001794 while (tok->lineno < 2 && tok->done == E_OK) {
1795 PyTokenizer_Get(tok, &p_start, &p_end);
1796 }
1797 fclose(fp);
1798 if (tok->encoding) {
1799 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1800 if (encoding)
1801 strcpy(encoding, tok->encoding);
1802 }
1803 PyTokenizer_Free(tok);
1804 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001805}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001806
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001807char *
1808PyTokenizer_FindEncoding(int fd)
1809{
1810 return PyTokenizer_FindEncodingFilename(fd, NULL);
1811}
1812
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text [start, end) in parentheses for NAME, NUMBER, STRING
   and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif