blob: 660c0f042bc951e7b50fe33a839d262870eec747 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   The argument is fully parenthesized so that expressions built from
   low-precedence operators (e.g. ?:) are classified correctly.
   C is evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.
   The argument is fully parenthesized so that expressions built from
   low-precedence operators (e.g. ?:) are classified correctly.
   C is evaluated several times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, one entry per token, listed in the
   order of the token numbers. */
char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: no decoding is done; return a freshly allocated copy of STR
   made with new_string() (NULL on allocation failure).
   EXEC_INPUT is not used by this variant. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the common spellings of the UTF-8 and Latin-1 encoding names to one
   canonical spelling.

   Only the first 12 characters of S are examined, after lowercasing them
   and replacing '_' with '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S is a recognized alias (optionally followed by "-<suffix>");
   otherwise returns S itself, so callers can detect "unchanged" by
   comparing pointers.  The returned literals must not be freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for EOF and values representable
               as unsigned char; plain char may be negative for bytes
               >= 0x80, so convert before the call. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200286 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200288 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200290 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 if (!get_coding_spec(line, &cs, size, tok))
292 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200293 if (!cs) {
294 Py_ssize_t i;
295 for (i = 0; i < size; i++) {
296 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
297 break;
298 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
299 /* Stop checking coding spec after a line containing
300 * anything except a comment. */
301 tok->read_coding_spec = 1;
302 break;
303 }
304 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200306 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700307 tok->read_coding_spec = 1;
308 if (tok->encoding == NULL) {
309 assert(tok->decoding_state == STATE_RAW);
310 if (strcmp(cs, "utf-8") == 0) {
311 tok->encoding = cs;
312 } else {
313 r = set_readline(tok, cs);
314 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300319 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700320 "encoding problem: %s", cs);
321 PyMem_FREE(cs);
322 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 } else { /* then, compare cs with BOM */
325 r = (strcmp(tok->encoding, cs) == 0);
326 if (!r)
327 PyErr_Format(PyExc_SyntaxError,
328 "encoding problem: %s with BOM", cs);
329 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332}
333
334/* See whether the file starts with a BOM. If it does,
335 invoke the set_readline function with the new encoding.
336 Return 1 on success, 0 on failure. */
337
338static int
339check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 void unget_char(int, struct tok_state *),
341 int set_readline(struct tok_state *, const char *),
342 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 int ch1, ch2, ch3;
345 ch1 = get_char(tok);
346 tok->decoding_state = STATE_RAW;
347 if (ch1 == EOF) {
348 return 1;
349 } else if (ch1 == 0xEF) {
350 ch2 = get_char(tok);
351 if (ch2 != 0xBB) {
352 unget_char(ch2, tok);
353 unget_char(ch1, tok);
354 return 1;
355 }
356 ch3 = get_char(tok);
357 if (ch3 != 0xBF) {
358 unget_char(ch3, tok);
359 unget_char(ch2, tok);
360 unget_char(ch1, tok);
361 return 1;
362 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Disable support for UTF-16 BOMs until a decision
365 is made whether this needs to be supported. */
366 } else if (ch1 == 0xFE) {
367 ch2 = get_char(tok);
368 if (ch2 != 0xFF) {
369 unget_char(ch2, tok);
370 unget_char(ch1, tok);
371 return 1;
372 }
373 if (!set_readline(tok, "utf-16-be"))
374 return 0;
375 tok->decoding_state = STATE_NORMAL;
376 } else if (ch1 == 0xFF) {
377 ch2 = get_char(tok);
378 if (ch2 != 0xFE) {
379 unget_char(ch2, tok);
380 unget_char(ch1, tok);
381 return 1;
382 }
383 if (!set_readline(tok, "utf-16-le"))
384 return 0;
385 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000386#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 } else {
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (tok->encoding != NULL)
392 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700393 tok->encoding = new_string("utf-8", 5, tok);
394 if (!tok->encoding)
395 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000396 /* No need to set_readline: input is already utf-8 */
397 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000398}
399
400/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000401 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000403 On entry, tok->decoding_buffer will be one of:
404 1) NULL: need to call tok->decoding_readline to get a new line
405 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000407 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 (in the s buffer) to copy entire contents of the line read
409 by tok->decoding_readline. tok->decoding_buffer has the overflow.
410 In this case, fp_readl is called in a loop (with an expanded buffer)
411 until the buffer ends with a '\n' (or until the end of the file is
412 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000413*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414
415static char *
416fp_readl(char *s, int size, struct tok_state *tok)
417{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 PyObject* bufobj;
419 const char *buf;
420 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 /* Ask for one less byte so we can terminate it */
423 assert(size > 0);
424 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 if (tok->decoding_buffer) {
427 bufobj = tok->decoding_buffer;
428 Py_INCREF(bufobj);
429 }
430 else
431 {
432 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
433 if (bufobj == NULL)
434 goto error;
435 }
436 if (PyUnicode_CheckExact(bufobj))
437 {
438 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
439 if (buf == NULL) {
440 goto error;
441 }
442 }
443 else
444 {
445 buf = PyByteArray_AsString(bufobj);
446 if (buf == NULL) {
447 goto error;
448 }
449 buflen = PyByteArray_GET_SIZE(bufobj);
450 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 Py_XDECREF(tok->decoding_buffer);
453 if (buflen > size) {
454 /* Too many chars, the rest goes into tok->decoding_buffer */
455 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
456 buflen-size);
457 if (tok->decoding_buffer == NULL)
458 goto error;
459 buflen = size;
460 }
461 else
462 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 memcpy(s, buf, buflen);
465 s[buflen] = '\0';
466 if (buflen == 0) /* EOF */
467 s = NULL;
468 Py_DECREF(bufobj);
469 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000470
471error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 Py_XDECREF(bufobj);
473 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Set the readline function for TOK to a StreamReader's
477 readline function. The StreamReader is named ENC.
478
479 This function is called from check_bom and check_coding_spec.
480
481 ENC is usually identical to the future value of tok->encoding,
482 except for the (currently unsupported) case of UTF-16.
483
484 Return 1 on success, 0 on failure. */
485
486static int
487fp_setreadl(struct tok_state *tok, const char* enc)
488{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(open);
491 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000492 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200493 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 io = PyImport_ImportModuleNoBlock("io");
496 if (io == NULL)
497 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Victor Stinner22a351a2010-10-14 12:04:34 +0000499 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200500 /* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100501 * position of tok->fp. If tok->fp was opened in text mode on Windows,
502 * its file position counts CRLF as one char and can't be directly mapped
503 * to the file offset for fd. Instead we step back one byte and read to
504 * the end of line.*/
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200505 pos = ftell(tok->fp);
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100506 if (pos == -1 ||
507 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000508 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
509 goto cleanup;
510 }
511
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200512 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000513 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514 if (stream == NULL)
515 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000517 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200518 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000519 tok->decoding_readline = readline;
Martin v. Löwis815b41b2014-02-28 15:27:29 +0100520 if (pos > 0) {
521 if (PyObject_CallObject(readline, NULL) == NULL) {
522 readline = NULL;
523 goto cleanup;
524 }
525 }
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000526
527 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000528 Py_XDECREF(stream);
529 Py_XDECREF(io);
530 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531}
532
533/* Fetch the next byte from TOK. */
534
535static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537}
538
539/* Unfetch the last byte back into TOK. */
540
541static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
/* Check whether the bytes at S start a structurally valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.

   Only the shape is checked: a lead byte followed by the right number of
   continuation bytes (0x80..0xBF).  Overlong forms and out-of-range code
   points are not rejected here.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the NUL
   without reading any byte beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* A NUL terminator is < 0x80, so the scan stops at it. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
572
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573/* Read a line of input from TOK. Determine encoding
574 if necessary. */
575
576static char *
577decoding_fgets(char *s, int size, struct tok_state *tok)
578{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000579 char *line = NULL;
580 int badchar = 0;
581 for (;;) {
582 if (tok->decoding_state == STATE_NORMAL) {
583 /* We already have a codec associated with
584 this input. */
585 line = fp_readl(s, size, tok);
586 break;
587 } else if (tok->decoding_state == STATE_RAW) {
588 /* We want a 'raw' read. */
589 line = Py_UniversalNewlineFgets(s, size,
590 tok->fp, NULL);
591 break;
592 } else {
593 /* We have not yet determined the encoding.
594 If an encoding is found, use the file-pointer
595 reader functions from now on. */
596 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
597 return error_ret(tok);
598 assert(tok->decoding_state != STATE_INIT);
599 }
600 }
601 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
602 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
603 return error_ret(tok);
604 }
605 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 /* The default encoding is UTF-8, so make sure we don't have any
608 non-UTF-8 sequences in it. */
609 if (line && !tok->encoding) {
610 unsigned char *c;
611 int length;
612 for (c = (unsigned char *)line; *c; c += length)
613 if (!(length = valid_utf8(c))) {
614 badchar = *c;
615 break;
616 }
617 }
618 if (badchar) {
619 /* Need to add 1 to the line number, since this line
620 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200621 PyErr_Format(PyExc_SyntaxError,
622 "Non-UTF-8 code starting with '\\x%.2x' "
623 "in file %U on line %i, "
624 "but no encoding declared; "
625 "see http://python.org/dev/peps/pep-0263/ for details",
626 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 return error_ret(tok);
628 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000629#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633static int
634decoding_feof(struct tok_state *tok)
635{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 if (tok->decoding_state != STATE_NORMAL) {
637 return feof(tok->fp);
638 } else {
639 PyObject* buf = tok->decoding_buffer;
640 if (buf == NULL) {
641 buf = PyObject_CallObject(tok->decoding_readline, NULL);
642 if (buf == NULL) {
643 error_ret(tok);
644 return 1;
645 } else {
646 tok->decoding_buffer = buf;
647 }
648 }
649 return PyObject_Length(buf) == 0;
650 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651}
652
653/* Fetch a byte from TOK, using the string buffer. */
654
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000655static int
656buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658}
659
660/* Unfetch a byte from TOK, using the string buffer. */
661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000662static void
663buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 tok->str--;
665 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666}
667
668/* Set the readline function for TOK to ENC. For the string-based
669 tokenizer, this means to just record the encoding. */
670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000671static int
672buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 tok->enc = enc;
674 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675}
676
677/* Return a UTF-8 encoding Python string object from the
678 C byte string STR, which is encoded with ENC. */
679
680static PyObject *
681translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 PyObject *utf8;
683 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
684 if (buf == NULL)
685 return NULL;
686 utf8 = PyUnicode_AsUTF8String(buf);
687 Py_DECREF(buf);
688 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000689}
690
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000691
692static char *
693translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
695 char *buf, *current;
696 char c = '\0';
697 buf = PyMem_MALLOC(needed_length);
698 if (buf == NULL) {
699 tok->done = E_NOMEM;
700 return NULL;
701 }
702 for (current = buf; *s; s++, current++) {
703 c = *s;
704 if (skip_next_lf) {
705 skip_next_lf = 0;
706 if (c == '\n') {
707 c = *++s;
708 if (!c)
709 break;
710 }
711 }
712 if (c == '\r') {
713 skip_next_lf = 1;
714 c = '\n';
715 }
716 *current = c;
717 }
718 /* If this is exec input, add a newline to the end of the string if
719 there isn't one already. */
720 if (exec_input && c != '\n') {
721 *current = '\n';
722 current++;
723 }
724 *current = '\0';
725 final_length = current - buf + 1;
726 if (final_length < needed_length && final_length)
727 /* should never fail */
728 buf = PyMem_REALLOC(buf, final_length);
729 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000730}
731
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000732/* Decode a byte string STR for use as the buffer of TOK.
733 Look for encoding declarations inside STR, and record them
734 inside TOK. */
735
736static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000737decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000738{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyObject* utf8 = NULL;
740 const char *str;
741 const char *s;
742 const char *newl[2] = {NULL, NULL};
743 int lineno = 0;
744 tok->input = str = translate_newlines(input, single, tok);
745 if (str == NULL)
746 return NULL;
747 tok->enc = NULL;
748 tok->str = str;
749 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
750 return error_ret(tok);
751 str = tok->str; /* string after BOM if any */
752 assert(str);
753 if (tok->enc != NULL) {
754 utf8 = translate_into_utf8(str, tok->enc);
755 if (utf8 == NULL)
756 return error_ret(tok);
757 str = PyBytes_AsString(utf8);
758 }
759 for (s = str;; s++) {
760 if (*s == '\0') break;
761 else if (*s == '\n') {
762 assert(lineno < 2);
763 newl[lineno] = s;
764 lineno++;
765 if (lineno == 2) break;
766 }
767 }
768 tok->enc = NULL;
769 /* need to check line 1 and 2 separately since check_coding_spec
770 assumes a single line as input */
771 if (newl[0]) {
772 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
773 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200774 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
776 tok, buf_setreadl))
777 return error_ret(tok);
778 }
779 }
780 if (tok->enc != NULL) {
781 assert(utf8 == NULL);
782 utf8 = translate_into_utf8(str, tok->enc);
783 if (utf8 == NULL)
784 return error_ret(tok);
785 str = PyBytes_AS_STRING(utf8);
786 }
787 assert(tok->decoding_buffer == NULL);
788 tok->decoding_buffer = utf8; /* CAUTION */
789 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000790}
791
792#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793
794/* Set up tokenizer for string */
795
796struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000797PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 struct tok_state *tok = tok_new();
800 if (tok == NULL)
801 return NULL;
802 str = (char *)decode_str(str, exec_input, tok);
803 if (str == NULL) {
804 PyTokenizer_Free(tok);
805 return NULL;
806 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000807
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 /* XXX: constify members. */
809 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
810 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811}
812
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000813struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000814PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000815{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 struct tok_state *tok = tok_new();
817 if (tok == NULL)
818 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000819#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000821#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 if (str == NULL) {
823 PyTokenizer_Free(tok);
824 return NULL;
825 }
826 tok->decoding_state = STATE_RAW;
827 tok->read_coding_spec = 1;
828 tok->enc = NULL;
829 tok->str = str;
830 tok->encoding = (char *)PyMem_MALLOC(6);
831 if (!tok->encoding) {
832 PyTokenizer_Free(tok);
833 return NULL;
834 }
835 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000836
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 /* XXX: constify members. */
838 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
839 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000840}
841
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000842/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843
844struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000845PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000847 struct tok_state *tok = tok_new();
848 if (tok == NULL)
849 return NULL;
850 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
851 PyTokenizer_Free(tok);
852 return NULL;
853 }
854 tok->cur = tok->inp = tok->buf;
855 tok->end = tok->buf + BUFSIZ;
856 tok->fp = fp;
857 tok->prompt = ps1;
858 tok->nextprompt = ps2;
859 if (enc != NULL) {
860 /* Must copy encoding declaration since it
861 gets copied into the parse tree. */
862 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
863 if (!tok->encoding) {
864 PyTokenizer_Free(tok);
865 return NULL;
866 }
867 strcpy(tok->encoding, enc);
868 tok->decoding_state = STATE_NORMAL;
869 }
870 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000871}
872
873
874/* Free a tok_state structure */
875
876void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000877PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000878{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 if (tok->encoding != NULL)
880 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000881#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 Py_XDECREF(tok->decoding_readline);
883 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200884 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000885#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000886 if (tok->fp != NULL && tok->buf != NULL)
887 PyMem_FREE(tok->buf);
888 if (tok->input)
889 PyMem_FREE((char *)tok->input);
890 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000891}
892
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000893/* Get next char, updating state; error code goes into tok->done */
894
895static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000896tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000897{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 for (;;) {
899 if (tok->cur != tok->inp) {
900 return Py_CHARMASK(*tok->cur++); /* Fast path */
901 }
902 if (tok->done != E_OK)
903 return EOF;
904 if (tok->fp == NULL) {
905 char *end = strchr(tok->inp, '\n');
906 if (end != NULL)
907 end++;
908 else {
909 end = strchr(tok->inp, '\0');
910 if (end == tok->inp) {
911 tok->done = E_EOF;
912 return EOF;
913 }
914 }
915 if (tok->start == NULL)
916 tok->buf = tok->cur;
917 tok->line_start = tok->cur;
918 tok->lineno++;
919 tok->inp = end;
920 return Py_CHARMASK(*tok->cur++);
921 }
922 if (tok->prompt != NULL) {
923 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000924#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000925 if (newtok != NULL) {
926 char *translated = translate_newlines(newtok, 0, tok);
927 PyMem_FREE(newtok);
928 if (translated == NULL)
929 return EOF;
930 newtok = translated;
931 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000932 if (tok->encoding && newtok && *newtok) {
933 /* Recode to UTF-8 */
934 Py_ssize_t buflen;
935 const char* buf;
936 PyObject *u = translate_into_utf8(newtok, tok->encoding);
937 PyMem_FREE(newtok);
938 if (!u) {
939 tok->done = E_DECODE;
940 return EOF;
941 }
942 buflen = PyBytes_GET_SIZE(u);
943 buf = PyBytes_AS_STRING(u);
944 if (!buf) {
945 Py_DECREF(u);
946 tok->done = E_DECODE;
947 return EOF;
948 }
949 newtok = PyMem_MALLOC(buflen+1);
950 strcpy(newtok, buf);
951 Py_DECREF(u);
952 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000953#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000954 if (tok->nextprompt != NULL)
955 tok->prompt = tok->nextprompt;
956 if (newtok == NULL)
957 tok->done = E_INTR;
958 else if (*newtok == '\0') {
959 PyMem_FREE(newtok);
960 tok->done = E_EOF;
961 }
962 else if (tok->start != NULL) {
963 size_t start = tok->start - tok->buf;
964 size_t oldlen = tok->cur - tok->buf;
965 size_t newlen = oldlen + strlen(newtok);
966 char *buf = tok->buf;
967 buf = (char *)PyMem_REALLOC(buf, newlen+1);
968 tok->lineno++;
969 if (buf == NULL) {
970 PyMem_FREE(tok->buf);
971 tok->buf = NULL;
972 PyMem_FREE(newtok);
973 tok->done = E_NOMEM;
974 return EOF;
975 }
976 tok->buf = buf;
977 tok->cur = tok->buf + oldlen;
978 tok->line_start = tok->cur;
979 strcpy(tok->buf + oldlen, newtok);
980 PyMem_FREE(newtok);
981 tok->inp = tok->buf + newlen;
982 tok->end = tok->inp + 1;
983 tok->start = tok->buf + start;
984 }
985 else {
986 tok->lineno++;
987 if (tok->buf != NULL)
988 PyMem_FREE(tok->buf);
989 tok->buf = newtok;
990 tok->line_start = tok->buf;
991 tok->cur = tok->buf;
992 tok->line_start = tok->buf;
993 tok->inp = strchr(tok->buf, '\0');
994 tok->end = tok->inp + 1;
995 }
996 }
997 else {
998 int done = 0;
999 Py_ssize_t cur = 0;
1000 char *pt;
1001 if (tok->start == NULL) {
1002 if (tok->buf == NULL) {
1003 tok->buf = (char *)
1004 PyMem_MALLOC(BUFSIZ);
1005 if (tok->buf == NULL) {
1006 tok->done = E_NOMEM;
1007 return EOF;
1008 }
1009 tok->end = tok->buf + BUFSIZ;
1010 }
1011 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1012 tok) == NULL) {
1013 tok->done = E_EOF;
1014 done = 1;
1015 }
1016 else {
1017 tok->done = E_OK;
1018 tok->inp = strchr(tok->buf, '\0');
1019 done = tok->inp[-1] == '\n';
1020 }
1021 }
1022 else {
1023 cur = tok->cur - tok->buf;
1024 if (decoding_feof(tok)) {
1025 tok->done = E_EOF;
1026 done = 1;
1027 }
1028 else
1029 tok->done = E_OK;
1030 }
1031 tok->lineno++;
1032 /* Read until '\n' or EOF */
1033 while (!done) {
1034 Py_ssize_t curstart = tok->start == NULL ? -1 :
1035 tok->start - tok->buf;
1036 Py_ssize_t curvalid = tok->inp - tok->buf;
1037 Py_ssize_t newsize = curvalid + BUFSIZ;
1038 char *newbuf = tok->buf;
1039 newbuf = (char *)PyMem_REALLOC(newbuf,
1040 newsize);
1041 if (newbuf == NULL) {
1042 tok->done = E_NOMEM;
1043 tok->cur = tok->inp;
1044 return EOF;
1045 }
1046 tok->buf = newbuf;
1047 tok->inp = tok->buf + curvalid;
1048 tok->end = tok->buf + newsize;
1049 tok->start = curstart < 0 ? NULL :
1050 tok->buf + curstart;
1051 if (decoding_fgets(tok->inp,
1052 (int)(tok->end - tok->inp),
1053 tok) == NULL) {
1054 /* Break out early on decoding
1055 errors, as tok->buf will be NULL
1056 */
1057 if (tok->decoding_erred)
1058 return EOF;
1059 /* Last line does not end in \n,
1060 fake one */
1061 strcpy(tok->inp, "\n");
1062 }
1063 tok->inp = strchr(tok->inp, '\0');
1064 done = tok->inp[-1] == '\n';
1065 }
1066 if (tok->buf != NULL) {
1067 tok->cur = tok->buf + cur;
1068 tok->line_start = tok->cur;
1069 /* replace "\r\n" with "\n" */
1070 /* For Mac leave the \r, giving a syntax error */
1071 pt = tok->inp - 2;
1072 if (pt >= tok->buf && *pt == '\r') {
1073 *pt++ = '\n';
1074 *pt = '\0';
1075 tok->inp = pt;
1076 }
1077 }
1078 }
1079 if (tok->done != E_OK) {
1080 if (tok->prompt != NULL)
1081 PySys_WriteStderr("\n");
1082 tok->cur = tok->inp;
1083 return EOF;
1084 }
1085 }
1086 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087}
1088
1089
1090/* Back-up one character */
1091
1092static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001093tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001094{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 if (c != EOF) {
1096 if (--tok->cur < tok->buf)
1097 Py_FatalError("tok_backup: beginning of buffer");
1098 if (*tok->cur != c)
1099 *tok->cur = c;
1100 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101}
1102
1103
1104/* Return the token corresponding to a single character */
1105
1106int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001107PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 switch (c) {
1110 case '(': return LPAR;
1111 case ')': return RPAR;
1112 case '[': return LSQB;
1113 case ']': return RSQB;
1114 case ':': return COLON;
1115 case ',': return COMMA;
1116 case ';': return SEMI;
1117 case '+': return PLUS;
1118 case '-': return MINUS;
1119 case '*': return STAR;
1120 case '/': return SLASH;
1121 case '|': return VBAR;
1122 case '&': return AMPER;
1123 case '<': return LESS;
1124 case '>': return GREATER;
1125 case '=': return EQUAL;
1126 case '.': return DOT;
1127 case '%': return PERCENT;
1128 case '{': return LBRACE;
1129 case '}': return RBRACE;
1130 case '^': return CIRCUMFLEX;
1131 case '~': return TILDE;
1132 case '@': return AT;
1133 default: return OP;
1134 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135}
1136
1137
Guido van Rossumfbab9051991-10-20 20:25:03 +00001138int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001139PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001140{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 switch (c1) {
1142 case '=':
1143 switch (c2) {
1144 case '=': return EQEQUAL;
1145 }
1146 break;
1147 case '!':
1148 switch (c2) {
1149 case '=': return NOTEQUAL;
1150 }
1151 break;
1152 case '<':
1153 switch (c2) {
1154 case '>': return NOTEQUAL;
1155 case '=': return LESSEQUAL;
1156 case '<': return LEFTSHIFT;
1157 }
1158 break;
1159 case '>':
1160 switch (c2) {
1161 case '=': return GREATEREQUAL;
1162 case '>': return RIGHTSHIFT;
1163 }
1164 break;
1165 case '+':
1166 switch (c2) {
1167 case '=': return PLUSEQUAL;
1168 }
1169 break;
1170 case '-':
1171 switch (c2) {
1172 case '=': return MINEQUAL;
1173 case '>': return RARROW;
1174 }
1175 break;
1176 case '*':
1177 switch (c2) {
1178 case '*': return DOUBLESTAR;
1179 case '=': return STAREQUAL;
1180 }
1181 break;
1182 case '/':
1183 switch (c2) {
1184 case '/': return DOUBLESLASH;
1185 case '=': return SLASHEQUAL;
1186 }
1187 break;
1188 case '|':
1189 switch (c2) {
1190 case '=': return VBAREQUAL;
1191 }
1192 break;
1193 case '%':
1194 switch (c2) {
1195 case '=': return PERCENTEQUAL;
1196 }
1197 break;
1198 case '&':
1199 switch (c2) {
1200 case '=': return AMPEREQUAL;
1201 }
1202 break;
1203 case '^':
1204 switch (c2) {
1205 case '=': return CIRCUMFLEXEQUAL;
1206 }
1207 break;
1208 }
1209 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001210}
1211
Thomas Wouters434d0822000-08-24 20:11:32 +00001212int
1213PyToken_ThreeChars(int c1, int c2, int c3)
1214{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 switch (c1) {
1216 case '<':
1217 switch (c2) {
1218 case '<':
1219 switch (c3) {
1220 case '=':
1221 return LEFTSHIFTEQUAL;
1222 }
1223 break;
1224 }
1225 break;
1226 case '>':
1227 switch (c2) {
1228 case '>':
1229 switch (c3) {
1230 case '=':
1231 return RIGHTSHIFTEQUAL;
1232 }
1233 break;
1234 }
1235 break;
1236 case '*':
1237 switch (c2) {
1238 case '*':
1239 switch (c3) {
1240 case '=':
1241 return DOUBLESTAREQUAL;
1242 }
1243 break;
1244 }
1245 break;
1246 case '/':
1247 switch (c2) {
1248 case '/':
1249 switch (c3) {
1250 case '=':
1251 return DOUBLESLASHEQUAL;
1252 }
1253 break;
1254 }
1255 break;
1256 case '.':
1257 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001258 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 switch (c3) {
1260 case '.':
1261 return ELLIPSIS;
1262 }
1263 break;
1264 }
1265 break;
1266 }
1267 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001268}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001269
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001271indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001272{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 if (tok->alterror) {
1274 tok->done = E_TABSPACE;
1275 tok->cur = tok->inp;
1276 return 1;
1277 }
1278 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001279#ifdef PGEN
1280 PySys_WriteStderr("inconsistent use of tabs and spaces "
1281 "in indentation\n");
1282#else
1283 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001285#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 tok->altwarning = 0;
1287 }
1288 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001289}
1290
Martin v. Löwis47383402007-08-15 07:32:56 +00001291#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001292#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001293#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294/* Verify that the identifier follows PEP 3131.
1295 All identifier strings are guaranteed to be "ready" unicode objects.
1296 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001297static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001298verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001299{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 PyObject *s;
1301 int result;
1302 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1305 PyErr_Clear();
1306 tok->done = E_IDENTIFIER;
1307 } else {
1308 tok->done = E_ERROR;
1309 }
1310 return 0;
1311 }
1312 result = PyUnicode_IsIdentifier(s);
1313 Py_DECREF(s);
1314 if (result == 0)
1315 tok->done = E_IDENTIFIER;
1316 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001317}
1318#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001319
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320/* Get next token, after space stripping etc. */
1321
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001322static int
1323tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 register int c;
1326 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001327
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001329 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 tok->start = NULL;
1331 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001332
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 /* Get indentation level */
1334 if (tok->atbol) {
1335 register int col = 0;
1336 register int altcol = 0;
1337 tok->atbol = 0;
1338 for (;;) {
1339 c = tok_nextc(tok);
1340 if (c == ' ')
1341 col++, altcol++;
1342 else if (c == '\t') {
1343 col = (col/tok->tabsize + 1) * tok->tabsize;
1344 altcol = (altcol/tok->alttabsize + 1)
1345 * tok->alttabsize;
1346 }
1347 else if (c == '\014') /* Control-L (formfeed) */
1348 col = altcol = 0; /* For Emacs users */
1349 else
1350 break;
1351 }
1352 tok_backup(tok, c);
1353 if (c == '#' || c == '\n') {
1354 /* Lines with only whitespace and/or comments
1355 shouldn't affect the indentation and are
1356 not passed to the parser as NEWLINE tokens,
1357 except *totally* empty lines in interactive
1358 mode, which signal the end of a command group. */
1359 if (col == 0 && c == '\n' && tok->prompt != NULL)
1360 blankline = 0; /* Let it through */
1361 else
1362 blankline = 1; /* Ignore completely */
1363 /* We can't jump back right here since we still
1364 may need to skip to the end of a comment */
1365 }
1366 if (!blankline && tok->level == 0) {
1367 if (col == tok->indstack[tok->indent]) {
1368 /* No change */
1369 if (altcol != tok->altindstack[tok->indent]) {
1370 if (indenterror(tok))
1371 return ERRORTOKEN;
1372 }
1373 }
1374 else if (col > tok->indstack[tok->indent]) {
1375 /* Indent -- always one */
1376 if (tok->indent+1 >= MAXINDENT) {
1377 tok->done = E_TOODEEP;
1378 tok->cur = tok->inp;
1379 return ERRORTOKEN;
1380 }
1381 if (altcol <= tok->altindstack[tok->indent]) {
1382 if (indenterror(tok))
1383 return ERRORTOKEN;
1384 }
1385 tok->pendin++;
1386 tok->indstack[++tok->indent] = col;
1387 tok->altindstack[tok->indent] = altcol;
1388 }
1389 else /* col < tok->indstack[tok->indent] */ {
1390 /* Dedent -- any number, must be consistent */
1391 while (tok->indent > 0 &&
1392 col < tok->indstack[tok->indent]) {
1393 tok->pendin--;
1394 tok->indent--;
1395 }
1396 if (col != tok->indstack[tok->indent]) {
1397 tok->done = E_DEDENT;
1398 tok->cur = tok->inp;
1399 return ERRORTOKEN;
1400 }
1401 if (altcol != tok->altindstack[tok->indent]) {
1402 if (indenterror(tok))
1403 return ERRORTOKEN;
1404 }
1405 }
1406 }
1407 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001408
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 /* Return pending indents/dedents */
1412 if (tok->pendin != 0) {
1413 if (tok->pendin < 0) {
1414 tok->pendin++;
1415 return DEDENT;
1416 }
1417 else {
1418 tok->pendin--;
1419 return INDENT;
1420 }
1421 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 tok->start = NULL;
1425 /* Skip spaces */
1426 do {
1427 c = tok_nextc(tok);
1428 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 /* Set start of current token */
1431 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 /* Skip comment */
1434 if (c == '#')
1435 while (c != EOF && c != '\n')
1436 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 /* Check for EOF and errors now */
1439 if (c == EOF) {
1440 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1441 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001442
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 /* Identifier (most frequent token!) */
1444 nonascii = 0;
1445 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001446 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001447 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001448 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001449 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001450 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001451 /* Since this is a backwards compatibility support literal we don't
1452 want to support it in arbitrary order like byte literals. */
1453 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1454 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001455 /* ur"" and ru"" are not supported */
1456 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001457 saw_r = 1;
1458 else
1459 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 c = tok_nextc(tok);
1461 if (c == '"' || c == '\'')
1462 goto letter_quote;
1463 }
1464 while (is_potential_identifier_char(c)) {
1465 if (c >= 128)
1466 nonascii = 1;
1467 c = tok_nextc(tok);
1468 }
1469 tok_backup(tok, c);
1470 if (nonascii &&
1471 !verify_identifier(tok)) {
1472 tok->done = E_IDENTIFIER;
1473 return ERRORTOKEN;
1474 }
1475 *p_start = tok->start;
1476 *p_end = tok->cur;
1477 return NAME;
1478 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 /* Newline */
1481 if (c == '\n') {
1482 tok->atbol = 1;
1483 if (blankline || tok->level > 0)
1484 goto nextline;
1485 *p_start = tok->start;
1486 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1487 tok->cont_line = 0;
1488 return NEWLINE;
1489 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001490
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 /* Period or number starting with period? */
1492 if (c == '.') {
1493 c = tok_nextc(tok);
1494 if (isdigit(c)) {
1495 goto fraction;
1496 } else if (c == '.') {
1497 c = tok_nextc(tok);
1498 if (c == '.') {
1499 *p_start = tok->start;
1500 *p_end = tok->cur;
1501 return ELLIPSIS;
1502 } else {
1503 tok_backup(tok, c);
1504 }
1505 tok_backup(tok, '.');
1506 } else {
1507 tok_backup(tok, c);
1508 }
1509 *p_start = tok->start;
1510 *p_end = tok->cur;
1511 return DOT;
1512 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001513
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001514 /* Number */
1515 if (isdigit(c)) {
1516 if (c == '0') {
1517 /* Hex, octal or binary -- maybe. */
1518 c = tok_nextc(tok);
1519 if (c == '.')
1520 goto fraction;
1521 if (c == 'j' || c == 'J')
1522 goto imaginary;
1523 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001524
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 /* Hex */
1526 c = tok_nextc(tok);
1527 if (!isxdigit(c)) {
1528 tok->done = E_TOKEN;
1529 tok_backup(tok, c);
1530 return ERRORTOKEN;
1531 }
1532 do {
1533 c = tok_nextc(tok);
1534 } while (isxdigit(c));
1535 }
1536 else if (c == 'o' || c == 'O') {
1537 /* Octal */
1538 c = tok_nextc(tok);
1539 if (c < '0' || c >= '8') {
1540 tok->done = E_TOKEN;
1541 tok_backup(tok, c);
1542 return ERRORTOKEN;
1543 }
1544 do {
1545 c = tok_nextc(tok);
1546 } while ('0' <= c && c < '8');
1547 }
1548 else if (c == 'b' || c == 'B') {
1549 /* Binary */
1550 c = tok_nextc(tok);
1551 if (c != '0' && c != '1') {
1552 tok->done = E_TOKEN;
1553 tok_backup(tok, c);
1554 return ERRORTOKEN;
1555 }
1556 do {
1557 c = tok_nextc(tok);
1558 } while (c == '0' || c == '1');
1559 }
1560 else {
1561 int nonzero = 0;
1562 /* maybe old-style octal; c is first char of it */
1563 /* in any case, allow '0' as a literal */
1564 while (c == '0')
1565 c = tok_nextc(tok);
1566 while (isdigit(c)) {
1567 nonzero = 1;
1568 c = tok_nextc(tok);
1569 }
1570 if (c == '.')
1571 goto fraction;
1572 else if (c == 'e' || c == 'E')
1573 goto exponent;
1574 else if (c == 'j' || c == 'J')
1575 goto imaginary;
1576 else if (nonzero) {
1577 tok->done = E_TOKEN;
1578 tok_backup(tok, c);
1579 return ERRORTOKEN;
1580 }
1581 }
1582 }
1583 else {
1584 /* Decimal */
1585 do {
1586 c = tok_nextc(tok);
1587 } while (isdigit(c));
1588 {
1589 /* Accept floating point numbers. */
1590 if (c == '.') {
1591 fraction:
1592 /* Fraction */
1593 do {
1594 c = tok_nextc(tok);
1595 } while (isdigit(c));
1596 }
1597 if (c == 'e' || c == 'E') {
1598 exponent:
1599 /* Exponent part */
1600 c = tok_nextc(tok);
1601 if (c == '+' || c == '-')
1602 c = tok_nextc(tok);
1603 if (!isdigit(c)) {
1604 tok->done = E_TOKEN;
1605 tok_backup(tok, c);
1606 return ERRORTOKEN;
1607 }
1608 do {
1609 c = tok_nextc(tok);
1610 } while (isdigit(c));
1611 }
1612 if (c == 'j' || c == 'J')
1613 /* Imaginary part */
1614 imaginary:
1615 c = tok_nextc(tok);
1616 }
1617 }
1618 tok_backup(tok, c);
1619 *p_start = tok->start;
1620 *p_end = tok->cur;
1621 return NUMBER;
1622 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001623
1624 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 /* String */
1626 if (c == '\'' || c == '"') {
1627 int quote = c;
1628 int quote_size = 1; /* 1 or 3 */
1629 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001630
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 /* Find the quote size and start of string */
1632 c = tok_nextc(tok);
1633 if (c == quote) {
1634 c = tok_nextc(tok);
1635 if (c == quote)
1636 quote_size = 3;
1637 else
1638 end_quote_size = 1; /* empty string found */
1639 }
1640 if (c != quote)
1641 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001642
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 /* Get rest of string */
1644 while (end_quote_size != quote_size) {
1645 c = tok_nextc(tok);
1646 if (c == EOF) {
1647 if (quote_size == 3)
1648 tok->done = E_EOFS;
1649 else
1650 tok->done = E_EOLS;
1651 tok->cur = tok->inp;
1652 return ERRORTOKEN;
1653 }
1654 if (quote_size == 1 && c == '\n') {
1655 tok->done = E_EOLS;
1656 tok->cur = tok->inp;
1657 return ERRORTOKEN;
1658 }
1659 if (c == quote)
1660 end_quote_size += 1;
1661 else {
1662 end_quote_size = 0;
1663 if (c == '\\')
1664 c = tok_nextc(tok); /* skip escaped char */
1665 }
1666 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001667
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 *p_start = tok->start;
1669 *p_end = tok->cur;
1670 return STRING;
1671 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001672
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 /* Line continuation */
1674 if (c == '\\') {
1675 c = tok_nextc(tok);
1676 if (c != '\n') {
1677 tok->done = E_LINECONT;
1678 tok->cur = tok->inp;
1679 return ERRORTOKEN;
1680 }
1681 tok->cont_line = 1;
1682 goto again; /* Read next line */
1683 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001684
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 /* Check for two-character token */
1686 {
1687 int c2 = tok_nextc(tok);
1688 int token = PyToken_TwoChars(c, c2);
1689 if (token != OP) {
1690 int c3 = tok_nextc(tok);
1691 int token3 = PyToken_ThreeChars(c, c2, c3);
1692 if (token3 != OP) {
1693 token = token3;
1694 } else {
1695 tok_backup(tok, c3);
1696 }
1697 *p_start = tok->start;
1698 *p_end = tok->cur;
1699 return token;
1700 }
1701 tok_backup(tok, c2);
1702 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001703
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 /* Keep track of parentheses nesting level */
1705 switch (c) {
1706 case '(':
1707 case '[':
1708 case '{':
1709 tok->level++;
1710 break;
1711 case ')':
1712 case ']':
1713 case '}':
1714 tok->level--;
1715 break;
1716 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001717
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 /* Punctuation character */
1719 *p_start = tok->start;
1720 *p_end = tok->cur;
1721 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001722}
1723
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001724int
1725PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1726{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 int result = tok_get(tok, p_start, p_end);
1728 if (tok->decoding_erred) {
1729 result = ERRORTOKEN;
1730 tok->done = E_DECODE;
1731 }
1732 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001733}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001734
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001735/* Get the encoding of a Python file. Check for the coding cookie and check if
1736 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001737
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001738 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1739 encoding in the first or second line of the file (in which case the encoding
1740 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001741
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001742 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1743 by the caller. */
1744
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001745char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001746PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001747{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 struct tok_state *tok;
1749 FILE *fp;
1750 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001751
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 fd = dup(fd);
1753 if (fd < 0) {
1754 return NULL;
1755 }
1756 fp = fdopen(fd, "r");
1757 if (fp == NULL) {
1758 return NULL;
1759 }
1760 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1761 if (tok == NULL) {
1762 fclose(fp);
1763 return NULL;
1764 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001765#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001766 if (filename != NULL) {
1767 Py_INCREF(filename);
1768 tok->filename = filename;
1769 }
1770 else {
1771 tok->filename = PyUnicode_FromString("<string>");
1772 if (tok->filename == NULL) {
1773 fclose(fp);
1774 PyTokenizer_Free(tok);
1775 return encoding;
1776 }
1777 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001778#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 while (tok->lineno < 2 && tok->done == E_OK) {
1780 PyTokenizer_Get(tok, &p_start, &p_end);
1781 }
1782 fclose(fp);
1783 if (tok->encoding) {
1784 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1785 if (encoding)
1786 strcpy(encoding, tok->encoding);
1787 }
1788 PyTokenizer_Free(tok);
1789 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001790}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001791
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001792char *
1793PyTokenizer_FindEncoding(int fd)
1794{
1795 return PyTokenizer_FindEncodingFilename(fd, NULL);
1796}
1797
#ifdef Py_DEBUG

/* Debug helper: print the name of a token type and, for NAME, NUMBER,
   STRING and OP tokens, the token text between start and end in
   parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif
1808#endif