blob: 8530723c266dcd382097380ad322f8383c75d49f [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero if byte C may begin an identifier: an ASCII letter, '_', or any
   value >= 128 (no further validation of non-ASCII bytes is done here).
   The argument is fully parenthesized so that expressions such as
   `x & 0xFF` are classified correctly; note that C is still evaluated
   more than once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero if byte C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128 (no further validation of non-ASCII
   bytes is done here).  The argument is fully parenthesized so that
   expressions such as `x & 0xFF` are classified correctly; C is still
   evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names */

/* Printable name for each token type, indexed by the token's numeric
   value.  The order of the entries is significant (see the note inside
   the table). */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
/* PGEN build: no decoding is performed; read a line straight from
   tok->fp with fgets(). */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
169
/* PGEN build: report end-of-file on tok->fp via feof(). */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
175
/* PGEN build: no decoding is performed; return an allocated copy of STR
   (NULL on allocation failure, see new_string).  EXEC_INPUT is unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map common spellings of the UTF-8 and Latin-1 codec names to one
   canonical spelling.

   At most the first 12 bytes of S are examined, lower-cased, with '_'
   treated as '-'.  Returns the string literal "utf-8" or "iso-8859-1"
   when S is recognized (exactly, or followed by a '-' suffix); otherwise
   returns S itself, so callers can detect "no change" by pointer
   comparison.  S is never modified. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value other than
           EOF to tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.

   LINE/SIZE describe the single line to inspect.  Once a line rules out
   any further coding declaration, tok->read_coding_spec is set to 1.
   On a 0 return caused by an encoding mismatch or a set_readline failure,
   a SyntaxError has been set. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;   /* coding spec found on the line; allocated, owned here
                   until it is stored in tok->encoding or freed */
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok))
        return 0;
    if (!cs) {
        /* No declaration on this line.  Decide whether a later line may
           still carry one: only blank or comment-only lines keep the
           search open. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
        return 1;
    }
    tok->read_coding_spec = 1;
    if (tok->encoding == NULL) {
        assert(tok->decoding_state == STATE_RAW);
        if (strcmp(cs, "utf-8") == 0) {
            /* Declared utf-8: no reader switch; tok->encoding takes
               ownership of cs. */
            tok->encoding = cs;
        } else {
            r = set_readline(tok, cs);
            if (r) {
                /* tok->encoding takes ownership of cs. */
                tok->encoding = cs;
                tok->decoding_state = STATE_NORMAL;
            }
            else {
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s", cs);
                PyMem_FREE(cs);
            }
        }
    } else {                /* then, compare cs with BOM */
        /* An encoding is already recorded; the declaration must agree
           with it.  cs is not stored on this path, so it is always
           freed. */
        r = (strcmp(tok->encoding, cs) == 0);
        if (!r)
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
        PyMem_FREE(cs);
    }
    return r;
}
333
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.

   GET_CHAR and UNGET_CHAR read and push back one byte of input for TOK.
   With the UTF-16 branches compiled out (#if 0), only the UTF-8 BOM
   (EF BB BF) is recognized and SET_READLINE is never called.
   tok->decoding_state is set to STATE_RAW on every path.  When a UTF-8
   BOM is consumed, tok->encoding is replaced by a fresh "utf-8" string;
   failure (0) then means that allocation failed. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM; on any mismatch push the bytes back in
           reverse order so the input is left untouched. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        /* Not a BOM: give the byte back. */
        unget_char(ch1, tok);
        return 1;
    }
    /* Only reached after a complete BOM was consumed. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
399
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored in S, followed by a terminating NUL.
   NULL is also returned when the line read is empty (end of file).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Take our own reference: tok->decoding_buffer is released
           below while buf still points into this object. */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Anything that is not an exact str is treated as a bytearray
           (the overflow stored by a previous call). */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                         buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

error:
    /* bufobj is NULL here when the readline call itself failed. */
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
475
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   The reader is obtained by calling io.open() on the file descriptor
   underlying tok->fp and storing the resulting object's bound
   `readline` in tok->decoding_readline.

   Return 1 on success, 0 on failure.  */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline = NULL, *stream = NULL, *io = NULL;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        goto cleanup;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. */
    pos = ftell(tok->fp);
    if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        goto cleanup;
    }

    /* Positional arguments of io.open(); the final Py_False is presumably
       closefd, leaving fd open when the stream goes away -- confirm
       against the io.open signature. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    if (stream == NULL)
        goto cleanup;

    /* Replace any previous reader.  If the attribute lookup fails,
       tok->decoding_readline ends up NULL and 0 is returned. */
    Py_XDECREF(tok->decoding_readline);
    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    tok->decoding_readline = readline;

  cleanup:
    Py_XDECREF(stream);
    Py_XDECREF(io);
    return readline != NULL;
}
522
/* Fetch the next byte from TOK.  Reads from tok->fp with getc(), so the
   result is whatever getc() returns (EOF included). */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
528
/* Unfetch the last byte back into TOK.  Pushes C back onto tok->fp with
   ungetc(); the result of ungetc() is not checked. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
534
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   S must point into a NUL-terminated buffer.  Only the lead byte and the
   10xxxxxx form of the continuation bytes are checked; overlong forms and
   out-of-range code points are not detected here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Check the continuation bytes front to back.  A NUL terminator is
       not a continuation byte, so a sequence truncated by the end of the
       string is rejected before any byte past the terminator is read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
562
/* Read a line of input from TOK. Determine encoding
   if necessary.

   Up to SIZE bytes of S are used.  Returns the line read, or NULL at end
   of input or on error (errors go through error_ret, which sets
   tok->decoding_erred). */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;    /* first byte of an invalid UTF-8 sequence, if any */
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            /* check_bom leaves STATE_INIT, so the next iteration takes
               one of the two reading branches above. */
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* Look for a coding declaration while tok->lineno < 2 and no earlier
       line has ended the search. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        /* Step through the line one UTF-8 sequence at a time. */
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                "Non-UTF-8 code starting with '\\x%.2x' "
                "in file %U on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
622
623static int
624decoding_feof(struct tok_state *tok)
625{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 if (tok->decoding_state != STATE_NORMAL) {
627 return feof(tok->fp);
628 } else {
629 PyObject* buf = tok->decoding_buffer;
630 if (buf == NULL) {
631 buf = PyObject_CallObject(tok->decoding_readline, NULL);
632 if (buf == NULL) {
633 error_ret(tok);
634 return 1;
635 } else {
636 tok->decoding_buffer = buf;
637 }
638 }
639 return PyObject_Length(buf) == 0;
640 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641}
642
/* Fetch a byte from TOK, using the string buffer.  Returns the byte at
   tok->str (masked with Py_CHARMASK) and advances tok->str; no
   end-of-string check is made here. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
649
/* Unfetch a byte from TOK, using the string buffer.  Only steps tok->str
   back by one; C is not written into the buffer, it is merely checked
   (in assert-enabled builds) against the byte already there. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
657
/* Set the readline function for TOK to ENC.  For the string-based
   tokenizer, this means to just record the encoding.  The ENC pointer
   itself is stored in tok->enc (no copy is made).  Always returns 1. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
666
667/* Return a UTF-8 encoding Python string object from the
668 C byte string STR, which is encoded with ENC. */
669
670static PyObject *
671translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 PyObject *utf8;
673 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
674 if (buf == NULL)
675 return NULL;
676 utf8 = PyUnicode_AsUTF8String(buf);
677 Py_DECREF(buf);
678 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679}
680
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000681
682static char *
683translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
685 char *buf, *current;
686 char c = '\0';
687 buf = PyMem_MALLOC(needed_length);
688 if (buf == NULL) {
689 tok->done = E_NOMEM;
690 return NULL;
691 }
692 for (current = buf; *s; s++, current++) {
693 c = *s;
694 if (skip_next_lf) {
695 skip_next_lf = 0;
696 if (c == '\n') {
697 c = *++s;
698 if (!c)
699 break;
700 }
701 }
702 if (c == '\r') {
703 skip_next_lf = 1;
704 c = '\n';
705 }
706 *current = c;
707 }
708 /* If this is exec input, add a newline to the end of the string if
709 there isn't one already. */
710 if (exec_input && c != '\n') {
711 *current = '\n';
712 current++;
713 }
714 *current = '\0';
715 final_length = current - buf + 1;
716 if (final_length < needed_length && final_length)
717 /* should never fail */
718 buf = PyMem_REALLOC(buf, final_length);
719 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000720}
721
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.

   INPUT is first copied with newlines normalized (translate_newlines,
   which also receives SINGLE); the copy is stored in tok->input.
   Returns the text to tokenize: either a pointer into that copy or, when
   the text had to be re-encoded, the bytes of an object kept alive
   through tok->decoding_buffer.  Returns NULL on failure. */

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};   /* ends of the first two lines */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* The BOM check selected an encoding via buf_setreadl. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the newlines that end the first and second lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        /* Try the second line only if the first neither declared an
           encoding nor ended the search. */
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        /* A coding declaration selected an encoding: re-encode the whole
           text as UTF-8. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    /* Park the bytes object (if any) in tok->decoding_buffer so that it,
       and therefore str, stays alive; it is released in
       PyTokenizer_Free. */
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
781
782#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783
784/* Set up tokenizer for string */
785
786struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000787PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 struct tok_state *tok = tok_new();
790 if (tok == NULL)
791 return NULL;
792 str = (char *)decode_str(str, exec_input, tok);
793 if (str == NULL) {
794 PyTokenizer_Free(tok);
795 return NULL;
796 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000797
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 /* XXX: constify members. */
799 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
800 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000801}
802
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000803struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000804PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000805{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 struct tok_state *tok = tok_new();
807 if (tok == NULL)
808 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000809#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000811#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 if (str == NULL) {
813 PyTokenizer_Free(tok);
814 return NULL;
815 }
816 tok->decoding_state = STATE_RAW;
817 tok->read_coding_spec = 1;
818 tok->enc = NULL;
819 tok->str = str;
820 tok->encoding = (char *)PyMem_MALLOC(6);
821 if (!tok->encoding) {
822 PyTokenizer_Free(tok);
823 return NULL;
824 }
825 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000826
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 /* XXX: constify members. */
828 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
829 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000830}
831
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000832/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000833
834struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000835PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 struct tok_state *tok = tok_new();
838 if (tok == NULL)
839 return NULL;
840 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
841 PyTokenizer_Free(tok);
842 return NULL;
843 }
844 tok->cur = tok->inp = tok->buf;
845 tok->end = tok->buf + BUFSIZ;
846 tok->fp = fp;
847 tok->prompt = ps1;
848 tok->nextprompt = ps2;
849 if (enc != NULL) {
850 /* Must copy encoding declaration since it
851 gets copied into the parse tree. */
852 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
853 if (!tok->encoding) {
854 PyTokenizer_Free(tok);
855 return NULL;
856 }
857 strcpy(tok->encoding, enc);
858 tok->decoding_state = STATE_NORMAL;
859 }
860 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861}
862
863
864/* Free a tok_state structure */
865
866void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000867PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869 if (tok->encoding != NULL)
870 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000871#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000872 Py_XDECREF(tok->decoding_readline);
873 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200874 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000875#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 if (tok->fp != NULL && tok->buf != NULL)
877 PyMem_FREE(tok->buf);
878 if (tok->input)
879 PyMem_FREE((char *)tok->input);
880 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881}
882
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883/* Get next char, updating state; error code goes into tok->done */
884
885static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000886tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 for (;;) {
889 if (tok->cur != tok->inp) {
890 return Py_CHARMASK(*tok->cur++); /* Fast path */
891 }
892 if (tok->done != E_OK)
893 return EOF;
894 if (tok->fp == NULL) {
895 char *end = strchr(tok->inp, '\n');
896 if (end != NULL)
897 end++;
898 else {
899 end = strchr(tok->inp, '\0');
900 if (end == tok->inp) {
901 tok->done = E_EOF;
902 return EOF;
903 }
904 }
905 if (tok->start == NULL)
906 tok->buf = tok->cur;
907 tok->line_start = tok->cur;
908 tok->lineno++;
909 tok->inp = end;
910 return Py_CHARMASK(*tok->cur++);
911 }
912 if (tok->prompt != NULL) {
913 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000914#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000915 if (newtok != NULL) {
916 char *translated = translate_newlines(newtok, 0, tok);
917 PyMem_FREE(newtok);
918 if (translated == NULL)
919 return EOF;
920 newtok = translated;
921 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000922 if (tok->encoding && newtok && *newtok) {
923 /* Recode to UTF-8 */
924 Py_ssize_t buflen;
925 const char* buf;
926 PyObject *u = translate_into_utf8(newtok, tok->encoding);
927 PyMem_FREE(newtok);
928 if (!u) {
929 tok->done = E_DECODE;
930 return EOF;
931 }
932 buflen = PyBytes_GET_SIZE(u);
933 buf = PyBytes_AS_STRING(u);
934 if (!buf) {
935 Py_DECREF(u);
936 tok->done = E_DECODE;
937 return EOF;
938 }
939 newtok = PyMem_MALLOC(buflen+1);
940 strcpy(newtok, buf);
941 Py_DECREF(u);
942 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000943#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000944 if (tok->nextprompt != NULL)
945 tok->prompt = tok->nextprompt;
946 if (newtok == NULL)
947 tok->done = E_INTR;
948 else if (*newtok == '\0') {
949 PyMem_FREE(newtok);
950 tok->done = E_EOF;
951 }
952 else if (tok->start != NULL) {
953 size_t start = tok->start - tok->buf;
954 size_t oldlen = tok->cur - tok->buf;
955 size_t newlen = oldlen + strlen(newtok);
956 char *buf = tok->buf;
957 buf = (char *)PyMem_REALLOC(buf, newlen+1);
958 tok->lineno++;
959 if (buf == NULL) {
960 PyMem_FREE(tok->buf);
961 tok->buf = NULL;
962 PyMem_FREE(newtok);
963 tok->done = E_NOMEM;
964 return EOF;
965 }
966 tok->buf = buf;
967 tok->cur = tok->buf + oldlen;
968 tok->line_start = tok->cur;
969 strcpy(tok->buf + oldlen, newtok);
970 PyMem_FREE(newtok);
971 tok->inp = tok->buf + newlen;
972 tok->end = tok->inp + 1;
973 tok->start = tok->buf + start;
974 }
975 else {
976 tok->lineno++;
977 if (tok->buf != NULL)
978 PyMem_FREE(tok->buf);
979 tok->buf = newtok;
980 tok->line_start = tok->buf;
981 tok->cur = tok->buf;
982 tok->line_start = tok->buf;
983 tok->inp = strchr(tok->buf, '\0');
984 tok->end = tok->inp + 1;
985 }
986 }
987 else {
988 int done = 0;
989 Py_ssize_t cur = 0;
990 char *pt;
991 if (tok->start == NULL) {
992 if (tok->buf == NULL) {
993 tok->buf = (char *)
994 PyMem_MALLOC(BUFSIZ);
995 if (tok->buf == NULL) {
996 tok->done = E_NOMEM;
997 return EOF;
998 }
999 tok->end = tok->buf + BUFSIZ;
1000 }
1001 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1002 tok) == NULL) {
1003 tok->done = E_EOF;
1004 done = 1;
1005 }
1006 else {
1007 tok->done = E_OK;
1008 tok->inp = strchr(tok->buf, '\0');
1009 done = tok->inp[-1] == '\n';
1010 }
1011 }
1012 else {
1013 cur = tok->cur - tok->buf;
1014 if (decoding_feof(tok)) {
1015 tok->done = E_EOF;
1016 done = 1;
1017 }
1018 else
1019 tok->done = E_OK;
1020 }
1021 tok->lineno++;
1022 /* Read until '\n' or EOF */
1023 while (!done) {
1024 Py_ssize_t curstart = tok->start == NULL ? -1 :
1025 tok->start - tok->buf;
1026 Py_ssize_t curvalid = tok->inp - tok->buf;
1027 Py_ssize_t newsize = curvalid + BUFSIZ;
1028 char *newbuf = tok->buf;
1029 newbuf = (char *)PyMem_REALLOC(newbuf,
1030 newsize);
1031 if (newbuf == NULL) {
1032 tok->done = E_NOMEM;
1033 tok->cur = tok->inp;
1034 return EOF;
1035 }
1036 tok->buf = newbuf;
1037 tok->inp = tok->buf + curvalid;
1038 tok->end = tok->buf + newsize;
1039 tok->start = curstart < 0 ? NULL :
1040 tok->buf + curstart;
1041 if (decoding_fgets(tok->inp,
1042 (int)(tok->end - tok->inp),
1043 tok) == NULL) {
1044 /* Break out early on decoding
1045 errors, as tok->buf will be NULL
1046 */
1047 if (tok->decoding_erred)
1048 return EOF;
1049 /* Last line does not end in \n,
1050 fake one */
1051 strcpy(tok->inp, "\n");
1052 }
1053 tok->inp = strchr(tok->inp, '\0');
1054 done = tok->inp[-1] == '\n';
1055 }
1056 if (tok->buf != NULL) {
1057 tok->cur = tok->buf + cur;
1058 tok->line_start = tok->cur;
1059 /* replace "\r\n" with "\n" */
1060 /* For Mac leave the \r, giving a syntax error */
1061 pt = tok->inp - 2;
1062 if (pt >= tok->buf && *pt == '\r') {
1063 *pt++ = '\n';
1064 *pt = '\0';
1065 tok->inp = pt;
1066 }
1067 }
1068 }
1069 if (tok->done != E_OK) {
1070 if (tok->prompt != NULL)
1071 PySys_WriteStderr("\n");
1072 tok->cur = tok->inp;
1073 return EOF;
1074 }
1075 }
1076 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001077}
1078
1079
1080/* Back-up one character */
1081
1082static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001083tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001084{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 if (c != EOF) {
1086 if (--tok->cur < tok->buf)
1087 Py_FatalError("tok_backup: beginning of buffer");
1088 if (*tok->cur != c)
1089 *tok->cur = c;
1090 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091}
1092
1093
1094/* Return the token corresponding to a single character */
1095
1096int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001097PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001098{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 switch (c) {
1100 case '(': return LPAR;
1101 case ')': return RPAR;
1102 case '[': return LSQB;
1103 case ']': return RSQB;
1104 case ':': return COLON;
1105 case ',': return COMMA;
1106 case ';': return SEMI;
1107 case '+': return PLUS;
1108 case '-': return MINUS;
1109 case '*': return STAR;
1110 case '/': return SLASH;
1111 case '|': return VBAR;
1112 case '&': return AMPER;
1113 case '<': return LESS;
1114 case '>': return GREATER;
1115 case '=': return EQUAL;
1116 case '.': return DOT;
1117 case '%': return PERCENT;
1118 case '{': return LBRACE;
1119 case '}': return RBRACE;
1120 case '^': return CIRCUMFLEX;
1121 case '~': return TILDE;
1122 case '@': return AT;
1123 default: return OP;
1124 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001125}
1126
1127
Guido van Rossumfbab9051991-10-20 20:25:03 +00001128int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001129PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001130{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 switch (c1) {
1132 case '=':
1133 switch (c2) {
1134 case '=': return EQEQUAL;
1135 }
1136 break;
1137 case '!':
1138 switch (c2) {
1139 case '=': return NOTEQUAL;
1140 }
1141 break;
1142 case '<':
1143 switch (c2) {
1144 case '>': return NOTEQUAL;
1145 case '=': return LESSEQUAL;
1146 case '<': return LEFTSHIFT;
1147 }
1148 break;
1149 case '>':
1150 switch (c2) {
1151 case '=': return GREATEREQUAL;
1152 case '>': return RIGHTSHIFT;
1153 }
1154 break;
1155 case '+':
1156 switch (c2) {
1157 case '=': return PLUSEQUAL;
1158 }
1159 break;
1160 case '-':
1161 switch (c2) {
1162 case '=': return MINEQUAL;
1163 case '>': return RARROW;
1164 }
1165 break;
1166 case '*':
1167 switch (c2) {
1168 case '*': return DOUBLESTAR;
1169 case '=': return STAREQUAL;
1170 }
1171 break;
1172 case '/':
1173 switch (c2) {
1174 case '/': return DOUBLESLASH;
1175 case '=': return SLASHEQUAL;
1176 }
1177 break;
1178 case '|':
1179 switch (c2) {
1180 case '=': return VBAREQUAL;
1181 }
1182 break;
1183 case '%':
1184 switch (c2) {
1185 case '=': return PERCENTEQUAL;
1186 }
1187 break;
1188 case '&':
1189 switch (c2) {
1190 case '=': return AMPEREQUAL;
1191 }
1192 break;
1193 case '^':
1194 switch (c2) {
1195 case '=': return CIRCUMFLEXEQUAL;
1196 }
1197 break;
1198 }
1199 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001200}
1201
Thomas Wouters434d0822000-08-24 20:11:32 +00001202int
1203PyToken_ThreeChars(int c1, int c2, int c3)
1204{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 switch (c1) {
1206 case '<':
1207 switch (c2) {
1208 case '<':
1209 switch (c3) {
1210 case '=':
1211 return LEFTSHIFTEQUAL;
1212 }
1213 break;
1214 }
1215 break;
1216 case '>':
1217 switch (c2) {
1218 case '>':
1219 switch (c3) {
1220 case '=':
1221 return RIGHTSHIFTEQUAL;
1222 }
1223 break;
1224 }
1225 break;
1226 case '*':
1227 switch (c2) {
1228 case '*':
1229 switch (c3) {
1230 case '=':
1231 return DOUBLESTAREQUAL;
1232 }
1233 break;
1234 }
1235 break;
1236 case '/':
1237 switch (c2) {
1238 case '/':
1239 switch (c3) {
1240 case '=':
1241 return DOUBLESLASHEQUAL;
1242 }
1243 break;
1244 }
1245 break;
1246 case '.':
1247 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001248 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 switch (c3) {
1250 case '.':
1251 return ELLIPSIS;
1252 }
1253 break;
1254 }
1255 break;
1256 }
1257 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001258}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001259
Guido van Rossum926f13a1998-04-09 21:38:06 +00001260static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001261indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001262{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 if (tok->alterror) {
1264 tok->done = E_TABSPACE;
1265 tok->cur = tok->inp;
1266 return 1;
1267 }
1268 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001269#ifdef PGEN
1270 PySys_WriteStderr("inconsistent use of tabs and spaces "
1271 "in indentation\n");
1272#else
1273 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001275#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 tok->altwarning = 0;
1277 }
1278 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001279}
1280
Martin v. Löwis47383402007-08-15 07:32:56 +00001281#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001282#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001283#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284/* Verify that the identifier follows PEP 3131.
1285 All identifier strings are guaranteed to be "ready" unicode objects.
1286 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001287static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001288verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001289{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 PyObject *s;
1291 int result;
1292 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1295 PyErr_Clear();
1296 tok->done = E_IDENTIFIER;
1297 } else {
1298 tok->done = E_ERROR;
1299 }
1300 return 0;
1301 }
1302 result = PyUnicode_IsIdentifier(s);
1303 Py_DECREF(s);
1304 if (result == 0)
1305 tok->done = E_IDENTIFIER;
1306 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001307}
1308#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001309
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310/* Get next token, after space stripping etc. */
1311
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001312static int
1313tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 register int c;
1316 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001317
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001319 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 tok->start = NULL;
1321 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001322
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 /* Get indentation level */
1324 if (tok->atbol) {
1325 register int col = 0;
1326 register int altcol = 0;
1327 tok->atbol = 0;
1328 for (;;) {
1329 c = tok_nextc(tok);
1330 if (c == ' ')
1331 col++, altcol++;
1332 else if (c == '\t') {
1333 col = (col/tok->tabsize + 1) * tok->tabsize;
1334 altcol = (altcol/tok->alttabsize + 1)
1335 * tok->alttabsize;
1336 }
1337 else if (c == '\014') /* Control-L (formfeed) */
1338 col = altcol = 0; /* For Emacs users */
1339 else
1340 break;
1341 }
1342 tok_backup(tok, c);
1343 if (c == '#' || c == '\n') {
1344 /* Lines with only whitespace and/or comments
1345 shouldn't affect the indentation and are
1346 not passed to the parser as NEWLINE tokens,
1347 except *totally* empty lines in interactive
1348 mode, which signal the end of a command group. */
1349 if (col == 0 && c == '\n' && tok->prompt != NULL)
1350 blankline = 0; /* Let it through */
1351 else
1352 blankline = 1; /* Ignore completely */
1353 /* We can't jump back right here since we still
1354 may need to skip to the end of a comment */
1355 }
1356 if (!blankline && tok->level == 0) {
1357 if (col == tok->indstack[tok->indent]) {
1358 /* No change */
1359 if (altcol != tok->altindstack[tok->indent]) {
1360 if (indenterror(tok))
1361 return ERRORTOKEN;
1362 }
1363 }
1364 else if (col > tok->indstack[tok->indent]) {
1365 /* Indent -- always one */
1366 if (tok->indent+1 >= MAXINDENT) {
1367 tok->done = E_TOODEEP;
1368 tok->cur = tok->inp;
1369 return ERRORTOKEN;
1370 }
1371 if (altcol <= tok->altindstack[tok->indent]) {
1372 if (indenterror(tok))
1373 return ERRORTOKEN;
1374 }
1375 tok->pendin++;
1376 tok->indstack[++tok->indent] = col;
1377 tok->altindstack[tok->indent] = altcol;
1378 }
1379 else /* col < tok->indstack[tok->indent] */ {
1380 /* Dedent -- any number, must be consistent */
1381 while (tok->indent > 0 &&
1382 col < tok->indstack[tok->indent]) {
1383 tok->pendin--;
1384 tok->indent--;
1385 }
1386 if (col != tok->indstack[tok->indent]) {
1387 tok->done = E_DEDENT;
1388 tok->cur = tok->inp;
1389 return ERRORTOKEN;
1390 }
1391 if (altcol != tok->altindstack[tok->indent]) {
1392 if (indenterror(tok))
1393 return ERRORTOKEN;
1394 }
1395 }
1396 }
1397 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001398
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 /* Return pending indents/dedents */
1402 if (tok->pendin != 0) {
1403 if (tok->pendin < 0) {
1404 tok->pendin++;
1405 return DEDENT;
1406 }
1407 else {
1408 tok->pendin--;
1409 return INDENT;
1410 }
1411 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 tok->start = NULL;
1415 /* Skip spaces */
1416 do {
1417 c = tok_nextc(tok);
1418 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001419
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001420 /* Set start of current token */
1421 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 /* Skip comment */
1424 if (c == '#')
1425 while (c != EOF && c != '\n')
1426 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001427
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 /* Check for EOF and errors now */
1429 if (c == EOF) {
1430 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1431 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 /* Identifier (most frequent token!) */
1434 nonascii = 0;
1435 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001436 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001437 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001438 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001439 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001440 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001441 /* Since this is a backwards compatibility support literal we don't
1442 want to support it in arbitrary order like byte literals. */
1443 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1444 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001445 /* ur"" and ru"" are not supported */
1446 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001447 saw_r = 1;
1448 else
1449 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 c = tok_nextc(tok);
1451 if (c == '"' || c == '\'')
1452 goto letter_quote;
1453 }
1454 while (is_potential_identifier_char(c)) {
1455 if (c >= 128)
1456 nonascii = 1;
1457 c = tok_nextc(tok);
1458 }
1459 tok_backup(tok, c);
1460 if (nonascii &&
1461 !verify_identifier(tok)) {
1462 tok->done = E_IDENTIFIER;
1463 return ERRORTOKEN;
1464 }
1465 *p_start = tok->start;
1466 *p_end = tok->cur;
1467 return NAME;
1468 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001469
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001470 /* Newline */
1471 if (c == '\n') {
1472 tok->atbol = 1;
1473 if (blankline || tok->level > 0)
1474 goto nextline;
1475 *p_start = tok->start;
1476 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1477 tok->cont_line = 0;
1478 return NEWLINE;
1479 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001480
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 /* Period or number starting with period? */
1482 if (c == '.') {
1483 c = tok_nextc(tok);
1484 if (isdigit(c)) {
1485 goto fraction;
1486 } else if (c == '.') {
1487 c = tok_nextc(tok);
1488 if (c == '.') {
1489 *p_start = tok->start;
1490 *p_end = tok->cur;
1491 return ELLIPSIS;
1492 } else {
1493 tok_backup(tok, c);
1494 }
1495 tok_backup(tok, '.');
1496 } else {
1497 tok_backup(tok, c);
1498 }
1499 *p_start = tok->start;
1500 *p_end = tok->cur;
1501 return DOT;
1502 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001503
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 /* Number */
1505 if (isdigit(c)) {
1506 if (c == '0') {
1507 /* Hex, octal or binary -- maybe. */
1508 c = tok_nextc(tok);
1509 if (c == '.')
1510 goto fraction;
1511 if (c == 'j' || c == 'J')
1512 goto imaginary;
1513 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001514
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 /* Hex */
1516 c = tok_nextc(tok);
1517 if (!isxdigit(c)) {
1518 tok->done = E_TOKEN;
1519 tok_backup(tok, c);
1520 return ERRORTOKEN;
1521 }
1522 do {
1523 c = tok_nextc(tok);
1524 } while (isxdigit(c));
1525 }
1526 else if (c == 'o' || c == 'O') {
1527 /* Octal */
1528 c = tok_nextc(tok);
1529 if (c < '0' || c >= '8') {
1530 tok->done = E_TOKEN;
1531 tok_backup(tok, c);
1532 return ERRORTOKEN;
1533 }
1534 do {
1535 c = tok_nextc(tok);
1536 } while ('0' <= c && c < '8');
1537 }
1538 else if (c == 'b' || c == 'B') {
1539 /* Binary */
1540 c = tok_nextc(tok);
1541 if (c != '0' && c != '1') {
1542 tok->done = E_TOKEN;
1543 tok_backup(tok, c);
1544 return ERRORTOKEN;
1545 }
1546 do {
1547 c = tok_nextc(tok);
1548 } while (c == '0' || c == '1');
1549 }
1550 else {
1551 int nonzero = 0;
1552 /* maybe old-style octal; c is first char of it */
1553 /* in any case, allow '0' as a literal */
1554 while (c == '0')
1555 c = tok_nextc(tok);
1556 while (isdigit(c)) {
1557 nonzero = 1;
1558 c = tok_nextc(tok);
1559 }
1560 if (c == '.')
1561 goto fraction;
1562 else if (c == 'e' || c == 'E')
1563 goto exponent;
1564 else if (c == 'j' || c == 'J')
1565 goto imaginary;
1566 else if (nonzero) {
1567 tok->done = E_TOKEN;
1568 tok_backup(tok, c);
1569 return ERRORTOKEN;
1570 }
1571 }
1572 }
1573 else {
1574 /* Decimal */
1575 do {
1576 c = tok_nextc(tok);
1577 } while (isdigit(c));
1578 {
1579 /* Accept floating point numbers. */
1580 if (c == '.') {
1581 fraction:
1582 /* Fraction */
1583 do {
1584 c = tok_nextc(tok);
1585 } while (isdigit(c));
1586 }
1587 if (c == 'e' || c == 'E') {
1588 exponent:
1589 /* Exponent part */
1590 c = tok_nextc(tok);
1591 if (c == '+' || c == '-')
1592 c = tok_nextc(tok);
1593 if (!isdigit(c)) {
1594 tok->done = E_TOKEN;
1595 tok_backup(tok, c);
1596 return ERRORTOKEN;
1597 }
1598 do {
1599 c = tok_nextc(tok);
1600 } while (isdigit(c));
1601 }
1602 if (c == 'j' || c == 'J')
1603 /* Imaginary part */
1604 imaginary:
1605 c = tok_nextc(tok);
1606 }
1607 }
1608 tok_backup(tok, c);
1609 *p_start = tok->start;
1610 *p_end = tok->cur;
1611 return NUMBER;
1612 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001613
1614 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001615 /* String */
1616 if (c == '\'' || c == '"') {
1617 int quote = c;
1618 int quote_size = 1; /* 1 or 3 */
1619 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001620
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001621 /* Find the quote size and start of string */
1622 c = tok_nextc(tok);
1623 if (c == quote) {
1624 c = tok_nextc(tok);
1625 if (c == quote)
1626 quote_size = 3;
1627 else
1628 end_quote_size = 1; /* empty string found */
1629 }
1630 if (c != quote)
1631 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001632
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 /* Get rest of string */
1634 while (end_quote_size != quote_size) {
1635 c = tok_nextc(tok);
1636 if (c == EOF) {
1637 if (quote_size == 3)
1638 tok->done = E_EOFS;
1639 else
1640 tok->done = E_EOLS;
1641 tok->cur = tok->inp;
1642 return ERRORTOKEN;
1643 }
1644 if (quote_size == 1 && c == '\n') {
1645 tok->done = E_EOLS;
1646 tok->cur = tok->inp;
1647 return ERRORTOKEN;
1648 }
1649 if (c == quote)
1650 end_quote_size += 1;
1651 else {
1652 end_quote_size = 0;
1653 if (c == '\\')
1654 c = tok_nextc(tok); /* skip escaped char */
1655 }
1656 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001657
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 *p_start = tok->start;
1659 *p_end = tok->cur;
1660 return STRING;
1661 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001662
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 /* Line continuation */
1664 if (c == '\\') {
1665 c = tok_nextc(tok);
1666 if (c != '\n') {
1667 tok->done = E_LINECONT;
1668 tok->cur = tok->inp;
1669 return ERRORTOKEN;
1670 }
1671 tok->cont_line = 1;
1672 goto again; /* Read next line */
1673 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001674
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 /* Check for two-character token */
1676 {
1677 int c2 = tok_nextc(tok);
1678 int token = PyToken_TwoChars(c, c2);
1679 if (token != OP) {
1680 int c3 = tok_nextc(tok);
1681 int token3 = PyToken_ThreeChars(c, c2, c3);
1682 if (token3 != OP) {
1683 token = token3;
1684 } else {
1685 tok_backup(tok, c3);
1686 }
1687 *p_start = tok->start;
1688 *p_end = tok->cur;
1689 return token;
1690 }
1691 tok_backup(tok, c2);
1692 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001693
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001694 /* Keep track of parentheses nesting level */
1695 switch (c) {
1696 case '(':
1697 case '[':
1698 case '{':
1699 tok->level++;
1700 break;
1701 case ')':
1702 case ']':
1703 case '}':
1704 tok->level--;
1705 break;
1706 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001707
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 /* Punctuation character */
1709 *p_start = tok->start;
1710 *p_end = tok->cur;
1711 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001712}
1713
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001714int
1715PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1716{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 int result = tok_get(tok, p_start, p_end);
1718 if (tok->decoding_erred) {
1719 result = ERRORTOKEN;
1720 tok->done = E_DECODE;
1721 }
1722 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001723}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001724
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001725/* Get the encoding of a Python file. Check for the coding cookie and check if
1726 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001727
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001728 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1729 encoding in the first or second line of the file (in which case the encoding
1730 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001731
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001732 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1733 by the caller. */
1734
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001735char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001736PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001737{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 struct tok_state *tok;
1739 FILE *fp;
1740 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001741
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 fd = dup(fd);
1743 if (fd < 0) {
1744 return NULL;
1745 }
1746 fp = fdopen(fd, "r");
1747 if (fp == NULL) {
1748 return NULL;
1749 }
1750 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1751 if (tok == NULL) {
1752 fclose(fp);
1753 return NULL;
1754 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001755#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001756 if (filename != NULL) {
1757 Py_INCREF(filename);
1758 tok->filename = filename;
1759 }
1760 else {
1761 tok->filename = PyUnicode_FromString("<string>");
1762 if (tok->filename == NULL) {
1763 fclose(fp);
1764 PyTokenizer_Free(tok);
1765 return encoding;
1766 }
1767 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001768#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 while (tok->lineno < 2 && tok->done == E_OK) {
1770 PyTokenizer_Get(tok, &p_start, &p_end);
1771 }
1772 fclose(fp);
1773 if (tok->encoding) {
1774 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1775 if (encoding)
1776 strcpy(encoding, tok->encoding);
1777 }
1778 PyTokenizer_Free(tok);
1779 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001780}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001781
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001782char *
1783PyTokenizer_FindEncoding(int fd)
1784{
1785 return PyTokenizer_FindEncodingFilename(fd, NULL);
1786}
1787
#ifdef Py_DEBUG

/* Debug helper: print the name of a token type to stdout and, for NAME,
   NUMBER, STRING and OP tokens, the token text [start, end) in
   parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    int has_text;

    printf("%s", _PyParser_TokenNames[type]);

    has_text = (type == NAME || type == NUMBER ||
                type == STRING || type == OP);
    if (has_text) {
        int length = (int)(end - start);
        printf("(%.*s)", length, start);
    }
}

#endif