blob: 5bf7e84f26163489d923bbcef0a22a1954b2b426 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C could begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  Whether a value >= 128 is really allowed in an
   identifier is not decided here (hence "potential").
   The argument is fully parenthesized so that expression arguments are
   classified correctly; it is still evaluated more than once, so it must
   not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

/* Same as is_potential_identifier_start, but ASCII digits are accepted
   too (for characters after the first one). */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    /* basic tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparison and multi-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: no decoding is done; return a fresh copy of STR made by
   new_string (NULL on allocation failure).  EXEC_INPUT is ignored. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   the result is one of the recognized spellings (optionally followed by
   "-something"), otherwise returns S itself, unchanged.  Callers compare
   the result with S to detect whether a replacement happened. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 if (tok->cont_line)
287 /* It's a continuation line, so it can't be a coding spec. */
288 return 1;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700289 if (!get_coding_spec(line, &cs, size, tok))
290 return 0;
291 if (!cs)
292 return 1;
293 tok->read_coding_spec = 1;
294 if (tok->encoding == NULL) {
295 assert(tok->decoding_state == STATE_RAW);
296 if (strcmp(cs, "utf-8") == 0) {
297 tok->encoding = cs;
298 } else {
299 r = set_readline(tok, cs);
300 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700302 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700304 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300305 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700306 "encoding problem: %s", cs);
307 PyMem_FREE(cs);
308 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000309 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700310 } else { /* then, compare cs with BOM */
311 r = (strcmp(tok->encoding, cs) == 0);
312 if (!r)
313 PyErr_Format(PyExc_SyntaxError,
314 "encoding problem: %s with BOM", cs);
315 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318}
319
320/* See whether the file starts with a BOM. If it does,
321 invoke the set_readline function with the new encoding.
322 Return 1 on success, 0 on failure. */
323
324static int
325check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 void unget_char(int, struct tok_state *),
327 int set_readline(struct tok_state *, const char *),
328 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000329{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 int ch1, ch2, ch3;
331 ch1 = get_char(tok);
332 tok->decoding_state = STATE_RAW;
333 if (ch1 == EOF) {
334 return 1;
335 } else if (ch1 == 0xEF) {
336 ch2 = get_char(tok);
337 if (ch2 != 0xBB) {
338 unget_char(ch2, tok);
339 unget_char(ch1, tok);
340 return 1;
341 }
342 ch3 = get_char(tok);
343 if (ch3 != 0xBF) {
344 unget_char(ch3, tok);
345 unget_char(ch2, tok);
346 unget_char(ch1, tok);
347 return 1;
348 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 /* Disable support for UTF-16 BOMs until a decision
351 is made whether this needs to be supported. */
352 } else if (ch1 == 0xFE) {
353 ch2 = get_char(tok);
354 if (ch2 != 0xFF) {
355 unget_char(ch2, tok);
356 unget_char(ch1, tok);
357 return 1;
358 }
359 if (!set_readline(tok, "utf-16-be"))
360 return 0;
361 tok->decoding_state = STATE_NORMAL;
362 } else if (ch1 == 0xFF) {
363 ch2 = get_char(tok);
364 if (ch2 != 0xFE) {
365 unget_char(ch2, tok);
366 unget_char(ch1, tok);
367 return 1;
368 }
369 if (!set_readline(tok, "utf-16-le"))
370 return 0;
371 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 } else {
374 unget_char(ch1, tok);
375 return 1;
376 }
377 if (tok->encoding != NULL)
378 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700379 tok->encoding = new_string("utf-8", 5, tok);
380 if (!tok->encoding)
381 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 /* No need to set_readline: input is already utf-8 */
383 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384}
385
386/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000389 On entry, tok->decoding_buffer will be one of:
390 1) NULL: need to call tok->decoding_readline to get a new line
391 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000393 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 (in the s buffer) to copy entire contents of the line read
395 by tok->decoding_readline. tok->decoding_buffer has the overflow.
396 In this case, fp_readl is called in a loop (with an expanded buffer)
397 until the buffer ends with a '\n' (or until the end of the file is
398 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000399*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000400
401static char *
402fp_readl(char *s, int size, struct tok_state *tok)
403{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyObject* bufobj;
405 const char *buf;
406 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 /* Ask for one less byte so we can terminate it */
409 assert(size > 0);
410 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000412 if (tok->decoding_buffer) {
413 bufobj = tok->decoding_buffer;
414 Py_INCREF(bufobj);
415 }
416 else
417 {
418 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
419 if (bufobj == NULL)
420 goto error;
421 }
422 if (PyUnicode_CheckExact(bufobj))
423 {
424 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
425 if (buf == NULL) {
426 goto error;
427 }
428 }
429 else
430 {
431 buf = PyByteArray_AsString(bufobj);
432 if (buf == NULL) {
433 goto error;
434 }
435 buflen = PyByteArray_GET_SIZE(bufobj);
436 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000437
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438 Py_XDECREF(tok->decoding_buffer);
439 if (buflen > size) {
440 /* Too many chars, the rest goes into tok->decoding_buffer */
441 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
442 buflen-size);
443 if (tok->decoding_buffer == NULL)
444 goto error;
445 buflen = size;
446 }
447 else
448 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000449
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000450 memcpy(s, buf, buflen);
451 s[buflen] = '\0';
452 if (buflen == 0) /* EOF */
453 s = NULL;
454 Py_DECREF(bufobj);
455 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000456
457error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 Py_XDECREF(bufobj);
459 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460}
461
462/* Set the readline function for TOK to a StreamReader's
463 readline function. The StreamReader is named ENC.
464
465 This function is called from check_bom and check_coding_spec.
466
467 ENC is usually identical to the future value of tok->encoding,
468 except for the (currently unsupported) case of UTF-16.
469
470 Return 1 on success, 0 on failure. */
471
472static int
473fp_setreadl(struct tok_state *tok, const char* enc)
474{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200476 _Py_IDENTIFIER(open);
477 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000478 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 io = PyImport_ImportModuleNoBlock("io");
481 if (io == NULL)
482 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000483
Victor Stinner22a351a2010-10-14 12:04:34 +0000484 fd = fileno(tok->fp);
485 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
486 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
487 goto cleanup;
488 }
489
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200490 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000491 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 if (stream == NULL)
493 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200496 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 /* The file has been reopened; parsing will restart from
500 * the beginning of the file, we have to reset the line number.
501 * But this function has been called from inside tok_nextc() which
502 * will increment lineno before it returns. So we set it -1 so that
503 * the next call to tok_nextc() will start with tok->lineno == 0.
504 */
505 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000506
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000507 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 Py_XDECREF(stream);
509 Py_XDECREF(io);
510 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000511}
512
513/* Fetch the next byte from TOK. */
514
515static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000516 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517}
518
519/* Unfetch the last byte back into TOK. */
520
521static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523}
524
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   Only the lead byte range and the 10xxxxxx form of the continuation
   bytes are checked.  Continuation bytes are examined front to back, so
   for a NUL-terminated string a truncated sequence is rejected at the
   terminator and nothing past it is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
552
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553/* Read a line of input from TOK. Determine encoding
554 if necessary. */
555
556static char *
557decoding_fgets(char *s, int size, struct tok_state *tok)
558{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000559 char *line = NULL;
560 int badchar = 0;
561 for (;;) {
562 if (tok->decoding_state == STATE_NORMAL) {
563 /* We already have a codec associated with
564 this input. */
565 line = fp_readl(s, size, tok);
566 break;
567 } else if (tok->decoding_state == STATE_RAW) {
568 /* We want a 'raw' read. */
569 line = Py_UniversalNewlineFgets(s, size,
570 tok->fp, NULL);
571 break;
572 } else {
573 /* We have not yet determined the encoding.
574 If an encoding is found, use the file-pointer
575 reader functions from now on. */
576 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
577 return error_ret(tok);
578 assert(tok->decoding_state != STATE_INIT);
579 }
580 }
581 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
582 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
583 return error_ret(tok);
584 }
585 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 /* The default encoding is UTF-8, so make sure we don't have any
588 non-UTF-8 sequences in it. */
589 if (line && !tok->encoding) {
590 unsigned char *c;
591 int length;
592 for (c = (unsigned char *)line; *c; c += length)
593 if (!(length = valid_utf8(c))) {
594 badchar = *c;
595 break;
596 }
597 }
598 if (badchar) {
599 /* Need to add 1 to the line number, since this line
600 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200601 PyErr_Format(PyExc_SyntaxError,
602 "Non-UTF-8 code starting with '\\x%.2x' "
603 "in file %U on line %i, "
604 "but no encoding declared; "
605 "see http://python.org/dev/peps/pep-0263/ for details",
606 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 return error_ret(tok);
608 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611}
612
613static int
614decoding_feof(struct tok_state *tok)
615{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 if (tok->decoding_state != STATE_NORMAL) {
617 return feof(tok->fp);
618 } else {
619 PyObject* buf = tok->decoding_buffer;
620 if (buf == NULL) {
621 buf = PyObject_CallObject(tok->decoding_readline, NULL);
622 if (buf == NULL) {
623 error_ret(tok);
624 return 1;
625 } else {
626 tok->decoding_buffer = buf;
627 }
628 }
629 return PyObject_Length(buf) == 0;
630 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000631}
632
633/* Fetch a byte from TOK, using the string buffer. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638}
639
640/* Unfetch a byte from TOK, using the string buffer. */
641
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000642static void
643buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 tok->str--;
645 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646}
647
648/* Set the readline function for TOK to ENC. For the string-based
649 tokenizer, this means to just record the encoding. */
650
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000651static int
652buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 tok->enc = enc;
654 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655}
656
657/* Return a UTF-8 encoding Python string object from the
658 C byte string STR, which is encoded with ENC. */
659
660static PyObject *
661translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662 PyObject *utf8;
663 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
664 if (buf == NULL)
665 return NULL;
666 utf8 = PyUnicode_AsUTF8String(buf);
667 Py_DECREF(buf);
668 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669}
670
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000671
672static char *
673translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200674 int skip_next_lf = 0;
675 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 char *buf, *current;
677 char c = '\0';
678 buf = PyMem_MALLOC(needed_length);
679 if (buf == NULL) {
680 tok->done = E_NOMEM;
681 return NULL;
682 }
683 for (current = buf; *s; s++, current++) {
684 c = *s;
685 if (skip_next_lf) {
686 skip_next_lf = 0;
687 if (c == '\n') {
688 c = *++s;
689 if (!c)
690 break;
691 }
692 }
693 if (c == '\r') {
694 skip_next_lf = 1;
695 c = '\n';
696 }
697 *current = c;
698 }
699 /* If this is exec input, add a newline to the end of the string if
700 there isn't one already. */
701 if (exec_input && c != '\n') {
702 *current = '\n';
703 current++;
704 }
705 *current = '\0';
706 final_length = current - buf + 1;
707 if (final_length < needed_length && final_length)
708 /* should never fail */
709 buf = PyMem_REALLOC(buf, final_length);
710 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000711}
712
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713/* Decode a byte string STR for use as the buffer of TOK.
714 Look for encoding declarations inside STR, and record them
715 inside TOK. */
716
717static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000718decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 PyObject* utf8 = NULL;
721 const char *str;
722 const char *s;
723 const char *newl[2] = {NULL, NULL};
724 int lineno = 0;
725 tok->input = str = translate_newlines(input, single, tok);
726 if (str == NULL)
727 return NULL;
728 tok->enc = NULL;
729 tok->str = str;
730 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
731 return error_ret(tok);
732 str = tok->str; /* string after BOM if any */
733 assert(str);
734 if (tok->enc != NULL) {
735 utf8 = translate_into_utf8(str, tok->enc);
736 if (utf8 == NULL)
737 return error_ret(tok);
738 str = PyBytes_AsString(utf8);
739 }
740 for (s = str;; s++) {
741 if (*s == '\0') break;
742 else if (*s == '\n') {
743 assert(lineno < 2);
744 newl[lineno] = s;
745 lineno++;
746 if (lineno == 2) break;
747 }
748 }
749 tok->enc = NULL;
750 /* need to check line 1 and 2 separately since check_coding_spec
751 assumes a single line as input */
752 if (newl[0]) {
753 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
754 return error_ret(tok);
755 if (tok->enc == NULL && newl[1]) {
756 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
757 tok, buf_setreadl))
758 return error_ret(tok);
759 }
760 }
761 if (tok->enc != NULL) {
762 assert(utf8 == NULL);
763 utf8 = translate_into_utf8(str, tok->enc);
764 if (utf8 == NULL)
765 return error_ret(tok);
766 str = PyBytes_AS_STRING(utf8);
767 }
768 assert(tok->decoding_buffer == NULL);
769 tok->decoding_buffer = utf8; /* CAUTION */
770 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000771}
772
773#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774
775/* Set up tokenizer for string */
776
777struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000778PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 struct tok_state *tok = tok_new();
781 if (tok == NULL)
782 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300783 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 if (str == NULL) {
785 PyTokenizer_Free(tok);
786 return NULL;
787 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000788
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 /* XXX: constify members. */
790 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
791 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000792}
793
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000794struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000795PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000796{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 struct tok_state *tok = tok_new();
798 if (tok == NULL)
799 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000800#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000802#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 if (str == NULL) {
804 PyTokenizer_Free(tok);
805 return NULL;
806 }
807 tok->decoding_state = STATE_RAW;
808 tok->read_coding_spec = 1;
809 tok->enc = NULL;
810 tok->str = str;
811 tok->encoding = (char *)PyMem_MALLOC(6);
812 if (!tok->encoding) {
813 PyTokenizer_Free(tok);
814 return NULL;
815 }
816 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000817
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 /* XXX: constify members. */
819 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
820 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000821}
822
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000823/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000824
825struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300826PyTokenizer_FromFile(FILE *fp, const char* enc,
827 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 struct tok_state *tok = tok_new();
830 if (tok == NULL)
831 return NULL;
832 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
833 PyTokenizer_Free(tok);
834 return NULL;
835 }
836 tok->cur = tok->inp = tok->buf;
837 tok->end = tok->buf + BUFSIZ;
838 tok->fp = fp;
839 tok->prompt = ps1;
840 tok->nextprompt = ps2;
841 if (enc != NULL) {
842 /* Must copy encoding declaration since it
843 gets copied into the parse tree. */
844 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
845 if (!tok->encoding) {
846 PyTokenizer_Free(tok);
847 return NULL;
848 }
849 strcpy(tok->encoding, enc);
850 tok->decoding_state = STATE_NORMAL;
851 }
852 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853}
854
855
856/* Free a tok_state structure */
857
858void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000859PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 if (tok->encoding != NULL)
862 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000863#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 Py_XDECREF(tok->decoding_readline);
865 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200866 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000867#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000868 if (tok->fp != NULL && tok->buf != NULL)
869 PyMem_FREE(tok->buf);
870 if (tok->input)
871 PyMem_FREE((char *)tok->input);
872 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873}
874
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875/* Get next char, updating state; error code goes into tok->done */
876
877static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200878tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000879{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000880 for (;;) {
881 if (tok->cur != tok->inp) {
882 return Py_CHARMASK(*tok->cur++); /* Fast path */
883 }
884 if (tok->done != E_OK)
885 return EOF;
886 if (tok->fp == NULL) {
887 char *end = strchr(tok->inp, '\n');
888 if (end != NULL)
889 end++;
890 else {
891 end = strchr(tok->inp, '\0');
892 if (end == tok->inp) {
893 tok->done = E_EOF;
894 return EOF;
895 }
896 }
897 if (tok->start == NULL)
898 tok->buf = tok->cur;
899 tok->line_start = tok->cur;
900 tok->lineno++;
901 tok->inp = end;
902 return Py_CHARMASK(*tok->cur++);
903 }
904 if (tok->prompt != NULL) {
905 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000906#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000907 if (newtok != NULL) {
908 char *translated = translate_newlines(newtok, 0, tok);
909 PyMem_FREE(newtok);
910 if (translated == NULL)
911 return EOF;
912 newtok = translated;
913 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 if (tok->encoding && newtok && *newtok) {
915 /* Recode to UTF-8 */
916 Py_ssize_t buflen;
917 const char* buf;
918 PyObject *u = translate_into_utf8(newtok, tok->encoding);
919 PyMem_FREE(newtok);
920 if (!u) {
921 tok->done = E_DECODE;
922 return EOF;
923 }
924 buflen = PyBytes_GET_SIZE(u);
925 buf = PyBytes_AS_STRING(u);
926 if (!buf) {
927 Py_DECREF(u);
928 tok->done = E_DECODE;
929 return EOF;
930 }
931 newtok = PyMem_MALLOC(buflen+1);
932 strcpy(newtok, buf);
933 Py_DECREF(u);
934 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000935#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000936 if (tok->nextprompt != NULL)
937 tok->prompt = tok->nextprompt;
938 if (newtok == NULL)
939 tok->done = E_INTR;
940 else if (*newtok == '\0') {
941 PyMem_FREE(newtok);
942 tok->done = E_EOF;
943 }
944 else if (tok->start != NULL) {
945 size_t start = tok->start - tok->buf;
946 size_t oldlen = tok->cur - tok->buf;
947 size_t newlen = oldlen + strlen(newtok);
948 char *buf = tok->buf;
949 buf = (char *)PyMem_REALLOC(buf, newlen+1);
950 tok->lineno++;
951 if (buf == NULL) {
952 PyMem_FREE(tok->buf);
953 tok->buf = NULL;
954 PyMem_FREE(newtok);
955 tok->done = E_NOMEM;
956 return EOF;
957 }
958 tok->buf = buf;
959 tok->cur = tok->buf + oldlen;
960 tok->line_start = tok->cur;
961 strcpy(tok->buf + oldlen, newtok);
962 PyMem_FREE(newtok);
963 tok->inp = tok->buf + newlen;
964 tok->end = tok->inp + 1;
965 tok->start = tok->buf + start;
966 }
967 else {
968 tok->lineno++;
969 if (tok->buf != NULL)
970 PyMem_FREE(tok->buf);
971 tok->buf = newtok;
972 tok->line_start = tok->buf;
973 tok->cur = tok->buf;
974 tok->line_start = tok->buf;
975 tok->inp = strchr(tok->buf, '\0');
976 tok->end = tok->inp + 1;
977 }
978 }
979 else {
980 int done = 0;
981 Py_ssize_t cur = 0;
982 char *pt;
983 if (tok->start == NULL) {
984 if (tok->buf == NULL) {
985 tok->buf = (char *)
986 PyMem_MALLOC(BUFSIZ);
987 if (tok->buf == NULL) {
988 tok->done = E_NOMEM;
989 return EOF;
990 }
991 tok->end = tok->buf + BUFSIZ;
992 }
993 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
994 tok) == NULL) {
995 tok->done = E_EOF;
996 done = 1;
997 }
998 else {
999 tok->done = E_OK;
1000 tok->inp = strchr(tok->buf, '\0');
1001 done = tok->inp[-1] == '\n';
1002 }
1003 }
1004 else {
1005 cur = tok->cur - tok->buf;
1006 if (decoding_feof(tok)) {
1007 tok->done = E_EOF;
1008 done = 1;
1009 }
1010 else
1011 tok->done = E_OK;
1012 }
1013 tok->lineno++;
1014 /* Read until '\n' or EOF */
1015 while (!done) {
1016 Py_ssize_t curstart = tok->start == NULL ? -1 :
1017 tok->start - tok->buf;
1018 Py_ssize_t curvalid = tok->inp - tok->buf;
1019 Py_ssize_t newsize = curvalid + BUFSIZ;
1020 char *newbuf = tok->buf;
1021 newbuf = (char *)PyMem_REALLOC(newbuf,
1022 newsize);
1023 if (newbuf == NULL) {
1024 tok->done = E_NOMEM;
1025 tok->cur = tok->inp;
1026 return EOF;
1027 }
1028 tok->buf = newbuf;
1029 tok->inp = tok->buf + curvalid;
1030 tok->end = tok->buf + newsize;
1031 tok->start = curstart < 0 ? NULL :
1032 tok->buf + curstart;
1033 if (decoding_fgets(tok->inp,
1034 (int)(tok->end - tok->inp),
1035 tok) == NULL) {
1036 /* Break out early on decoding
1037 errors, as tok->buf will be NULL
1038 */
1039 if (tok->decoding_erred)
1040 return EOF;
1041 /* Last line does not end in \n,
1042 fake one */
1043 strcpy(tok->inp, "\n");
1044 }
1045 tok->inp = strchr(tok->inp, '\0');
1046 done = tok->inp[-1] == '\n';
1047 }
1048 if (tok->buf != NULL) {
1049 tok->cur = tok->buf + cur;
1050 tok->line_start = tok->cur;
1051 /* replace "\r\n" with "\n" */
1052 /* For Mac leave the \r, giving a syntax error */
1053 pt = tok->inp - 2;
1054 if (pt >= tok->buf && *pt == '\r') {
1055 *pt++ = '\n';
1056 *pt = '\0';
1057 tok->inp = pt;
1058 }
1059 }
1060 }
1061 if (tok->done != E_OK) {
1062 if (tok->prompt != NULL)
1063 PySys_WriteStderr("\n");
1064 tok->cur = tok->inp;
1065 return EOF;
1066 }
1067 }
1068 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001069}
1070
1071
1072/* Back-up one character */
1073
1074static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001075tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001076{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 if (c != EOF) {
1078 if (--tok->cur < tok->buf)
1079 Py_FatalError("tok_backup: beginning of buffer");
1080 if (*tok->cur != c)
1081 *tok->cur = c;
1082 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001083}
1084
1085
1086/* Return the token corresponding to a single character */
1087
1088int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001089PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001090{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 switch (c) {
1092 case '(': return LPAR;
1093 case ')': return RPAR;
1094 case '[': return LSQB;
1095 case ']': return RSQB;
1096 case ':': return COLON;
1097 case ',': return COMMA;
1098 case ';': return SEMI;
1099 case '+': return PLUS;
1100 case '-': return MINUS;
1101 case '*': return STAR;
1102 case '/': return SLASH;
1103 case '|': return VBAR;
1104 case '&': return AMPER;
1105 case '<': return LESS;
1106 case '>': return GREATER;
1107 case '=': return EQUAL;
1108 case '.': return DOT;
1109 case '%': return PERCENT;
1110 case '{': return LBRACE;
1111 case '}': return RBRACE;
1112 case '^': return CIRCUMFLEX;
1113 case '~': return TILDE;
1114 case '@': return AT;
1115 default: return OP;
1116 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117}
1118
1119
Guido van Rossumfbab9051991-10-20 20:25:03 +00001120int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001121PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001122{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 switch (c1) {
1124 case '=':
1125 switch (c2) {
1126 case '=': return EQEQUAL;
1127 }
1128 break;
1129 case '!':
1130 switch (c2) {
1131 case '=': return NOTEQUAL;
1132 }
1133 break;
1134 case '<':
1135 switch (c2) {
1136 case '>': return NOTEQUAL;
1137 case '=': return LESSEQUAL;
1138 case '<': return LEFTSHIFT;
1139 }
1140 break;
1141 case '>':
1142 switch (c2) {
1143 case '=': return GREATEREQUAL;
1144 case '>': return RIGHTSHIFT;
1145 }
1146 break;
1147 case '+':
1148 switch (c2) {
1149 case '=': return PLUSEQUAL;
1150 }
1151 break;
1152 case '-':
1153 switch (c2) {
1154 case '=': return MINEQUAL;
1155 case '>': return RARROW;
1156 }
1157 break;
1158 case '*':
1159 switch (c2) {
1160 case '*': return DOUBLESTAR;
1161 case '=': return STAREQUAL;
1162 }
1163 break;
1164 case '/':
1165 switch (c2) {
1166 case '/': return DOUBLESLASH;
1167 case '=': return SLASHEQUAL;
1168 }
1169 break;
1170 case '|':
1171 switch (c2) {
1172 case '=': return VBAREQUAL;
1173 }
1174 break;
1175 case '%':
1176 switch (c2) {
1177 case '=': return PERCENTEQUAL;
1178 }
1179 break;
1180 case '&':
1181 switch (c2) {
1182 case '=': return AMPEREQUAL;
1183 }
1184 break;
1185 case '^':
1186 switch (c2) {
1187 case '=': return CIRCUMFLEXEQUAL;
1188 }
1189 break;
1190 }
1191 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001192}
1193
Thomas Wouters434d0822000-08-24 20:11:32 +00001194int
1195PyToken_ThreeChars(int c1, int c2, int c3)
1196{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 switch (c1) {
1198 case '<':
1199 switch (c2) {
1200 case '<':
1201 switch (c3) {
1202 case '=':
1203 return LEFTSHIFTEQUAL;
1204 }
1205 break;
1206 }
1207 break;
1208 case '>':
1209 switch (c2) {
1210 case '>':
1211 switch (c3) {
1212 case '=':
1213 return RIGHTSHIFTEQUAL;
1214 }
1215 break;
1216 }
1217 break;
1218 case '*':
1219 switch (c2) {
1220 case '*':
1221 switch (c3) {
1222 case '=':
1223 return DOUBLESTAREQUAL;
1224 }
1225 break;
1226 }
1227 break;
1228 case '/':
1229 switch (c2) {
1230 case '/':
1231 switch (c3) {
1232 case '=':
1233 return DOUBLESLASHEQUAL;
1234 }
1235 break;
1236 }
1237 break;
1238 case '.':
1239 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001240 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001241 switch (c3) {
1242 case '.':
1243 return ELLIPSIS;
1244 }
1245 break;
1246 }
1247 break;
1248 }
1249 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001250}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001251
Guido van Rossum926f13a1998-04-09 21:38:06 +00001252static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001253indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001254{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001255 if (tok->alterror) {
1256 tok->done = E_TABSPACE;
1257 tok->cur = tok->inp;
1258 return 1;
1259 }
1260 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001261#ifdef PGEN
1262 PySys_WriteStderr("inconsistent use of tabs and spaces "
1263 "in indentation\n");
1264#else
1265 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001267#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 tok->altwarning = 0;
1269 }
1270 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001271}
1272
Martin v. Löwis47383402007-08-15 07:32:56 +00001273#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001274#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001275#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276/* Verify that the identifier follows PEP 3131.
1277 All identifier strings are guaranteed to be "ready" unicode objects.
1278 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001279static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001280verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001281{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 PyObject *s;
1283 int result;
1284 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1287 PyErr_Clear();
1288 tok->done = E_IDENTIFIER;
1289 } else {
1290 tok->done = E_ERROR;
1291 }
1292 return 0;
1293 }
1294 result = PyUnicode_IsIdentifier(s);
1295 Py_DECREF(s);
1296 if (result == 0)
1297 tok->done = E_IDENTIFIER;
1298 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001299}
1300#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001301
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302/* Get next token, after space stripping etc. */
1303
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001304static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001305tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001307 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001309
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001311 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 tok->start = NULL;
1313 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001314
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 /* Get indentation level */
1316 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001317 int col = 0;
1318 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 tok->atbol = 0;
1320 for (;;) {
1321 c = tok_nextc(tok);
1322 if (c == ' ')
1323 col++, altcol++;
1324 else if (c == '\t') {
1325 col = (col/tok->tabsize + 1) * tok->tabsize;
1326 altcol = (altcol/tok->alttabsize + 1)
1327 * tok->alttabsize;
1328 }
1329 else if (c == '\014') /* Control-L (formfeed) */
1330 col = altcol = 0; /* For Emacs users */
1331 else
1332 break;
1333 }
1334 tok_backup(tok, c);
1335 if (c == '#' || c == '\n') {
1336 /* Lines with only whitespace and/or comments
1337 shouldn't affect the indentation and are
1338 not passed to the parser as NEWLINE tokens,
1339 except *totally* empty lines in interactive
1340 mode, which signal the end of a command group. */
1341 if (col == 0 && c == '\n' && tok->prompt != NULL)
1342 blankline = 0; /* Let it through */
1343 else
1344 blankline = 1; /* Ignore completely */
1345 /* We can't jump back right here since we still
1346 may need to skip to the end of a comment */
1347 }
1348 if (!blankline && tok->level == 0) {
1349 if (col == tok->indstack[tok->indent]) {
1350 /* No change */
1351 if (altcol != tok->altindstack[tok->indent]) {
1352 if (indenterror(tok))
1353 return ERRORTOKEN;
1354 }
1355 }
1356 else if (col > tok->indstack[tok->indent]) {
1357 /* Indent -- always one */
1358 if (tok->indent+1 >= MAXINDENT) {
1359 tok->done = E_TOODEEP;
1360 tok->cur = tok->inp;
1361 return ERRORTOKEN;
1362 }
1363 if (altcol <= tok->altindstack[tok->indent]) {
1364 if (indenterror(tok))
1365 return ERRORTOKEN;
1366 }
1367 tok->pendin++;
1368 tok->indstack[++tok->indent] = col;
1369 tok->altindstack[tok->indent] = altcol;
1370 }
1371 else /* col < tok->indstack[tok->indent] */ {
1372 /* Dedent -- any number, must be consistent */
1373 while (tok->indent > 0 &&
1374 col < tok->indstack[tok->indent]) {
1375 tok->pendin--;
1376 tok->indent--;
1377 }
1378 if (col != tok->indstack[tok->indent]) {
1379 tok->done = E_DEDENT;
1380 tok->cur = tok->inp;
1381 return ERRORTOKEN;
1382 }
1383 if (altcol != tok->altindstack[tok->indent]) {
1384 if (indenterror(tok))
1385 return ERRORTOKEN;
1386 }
1387 }
1388 }
1389 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001392
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 /* Return pending indents/dedents */
1394 if (tok->pendin != 0) {
1395 if (tok->pendin < 0) {
1396 tok->pendin++;
1397 return DEDENT;
1398 }
1399 else {
1400 tok->pendin--;
1401 return INDENT;
1402 }
1403 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001404
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001405 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406 tok->start = NULL;
1407 /* Skip spaces */
1408 do {
1409 c = tok_nextc(tok);
1410 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 /* Set start of current token */
1413 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 /* Skip comment */
1416 if (c == '#')
1417 while (c != EOF && c != '\n')
1418 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001419
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001420 /* Check for EOF and errors now */
1421 if (c == EOF) {
1422 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1423 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 /* Identifier (most frequent token!) */
1426 nonascii = 0;
1427 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001428 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001429 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001430 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001431 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001432 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001433 /* Since this is a backwards compatibility support literal we don't
1434 want to support it in arbitrary order like byte literals. */
1435 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1436 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001437 /* ur"" and ru"" are not supported */
1438 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001439 saw_r = 1;
1440 else
1441 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 c = tok_nextc(tok);
1443 if (c == '"' || c == '\'')
1444 goto letter_quote;
1445 }
1446 while (is_potential_identifier_char(c)) {
1447 if (c >= 128)
1448 nonascii = 1;
1449 c = tok_nextc(tok);
1450 }
1451 tok_backup(tok, c);
1452 if (nonascii &&
1453 !verify_identifier(tok)) {
1454 tok->done = E_IDENTIFIER;
1455 return ERRORTOKEN;
1456 }
1457 *p_start = tok->start;
1458 *p_end = tok->cur;
1459 return NAME;
1460 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001461
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 /* Newline */
1463 if (c == '\n') {
1464 tok->atbol = 1;
1465 if (blankline || tok->level > 0)
1466 goto nextline;
1467 *p_start = tok->start;
1468 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1469 tok->cont_line = 0;
1470 return NEWLINE;
1471 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001472
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 /* Period or number starting with period? */
1474 if (c == '.') {
1475 c = tok_nextc(tok);
1476 if (isdigit(c)) {
1477 goto fraction;
1478 } else if (c == '.') {
1479 c = tok_nextc(tok);
1480 if (c == '.') {
1481 *p_start = tok->start;
1482 *p_end = tok->cur;
1483 return ELLIPSIS;
1484 } else {
1485 tok_backup(tok, c);
1486 }
1487 tok_backup(tok, '.');
1488 } else {
1489 tok_backup(tok, c);
1490 }
1491 *p_start = tok->start;
1492 *p_end = tok->cur;
1493 return DOT;
1494 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001495
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 /* Number */
1497 if (isdigit(c)) {
1498 if (c == '0') {
1499 /* Hex, octal or binary -- maybe. */
1500 c = tok_nextc(tok);
1501 if (c == '.')
1502 goto fraction;
1503 if (c == 'j' || c == 'J')
1504 goto imaginary;
1505 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001506
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 /* Hex */
1508 c = tok_nextc(tok);
1509 if (!isxdigit(c)) {
1510 tok->done = E_TOKEN;
1511 tok_backup(tok, c);
1512 return ERRORTOKEN;
1513 }
1514 do {
1515 c = tok_nextc(tok);
1516 } while (isxdigit(c));
1517 }
1518 else if (c == 'o' || c == 'O') {
1519 /* Octal */
1520 c = tok_nextc(tok);
1521 if (c < '0' || c >= '8') {
1522 tok->done = E_TOKEN;
1523 tok_backup(tok, c);
1524 return ERRORTOKEN;
1525 }
1526 do {
1527 c = tok_nextc(tok);
1528 } while ('0' <= c && c < '8');
1529 }
1530 else if (c == 'b' || c == 'B') {
1531 /* Binary */
1532 c = tok_nextc(tok);
1533 if (c != '0' && c != '1') {
1534 tok->done = E_TOKEN;
1535 tok_backup(tok, c);
1536 return ERRORTOKEN;
1537 }
1538 do {
1539 c = tok_nextc(tok);
1540 } while (c == '0' || c == '1');
1541 }
1542 else {
1543 int nonzero = 0;
1544 /* maybe old-style octal; c is first char of it */
1545 /* in any case, allow '0' as a literal */
1546 while (c == '0')
1547 c = tok_nextc(tok);
1548 while (isdigit(c)) {
1549 nonzero = 1;
1550 c = tok_nextc(tok);
1551 }
1552 if (c == '.')
1553 goto fraction;
1554 else if (c == 'e' || c == 'E')
1555 goto exponent;
1556 else if (c == 'j' || c == 'J')
1557 goto imaginary;
1558 else if (nonzero) {
1559 tok->done = E_TOKEN;
1560 tok_backup(tok, c);
1561 return ERRORTOKEN;
1562 }
1563 }
1564 }
1565 else {
1566 /* Decimal */
1567 do {
1568 c = tok_nextc(tok);
1569 } while (isdigit(c));
1570 {
1571 /* Accept floating point numbers. */
1572 if (c == '.') {
1573 fraction:
1574 /* Fraction */
1575 do {
1576 c = tok_nextc(tok);
1577 } while (isdigit(c));
1578 }
1579 if (c == 'e' || c == 'E') {
1580 exponent:
1581 /* Exponent part */
1582 c = tok_nextc(tok);
1583 if (c == '+' || c == '-')
1584 c = tok_nextc(tok);
1585 if (!isdigit(c)) {
1586 tok->done = E_TOKEN;
1587 tok_backup(tok, c);
1588 return ERRORTOKEN;
1589 }
1590 do {
1591 c = tok_nextc(tok);
1592 } while (isdigit(c));
1593 }
1594 if (c == 'j' || c == 'J')
1595 /* Imaginary part */
1596 imaginary:
1597 c = tok_nextc(tok);
1598 }
1599 }
1600 tok_backup(tok, c);
1601 *p_start = tok->start;
1602 *p_end = tok->cur;
1603 return NUMBER;
1604 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001605
1606 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 /* String */
1608 if (c == '\'' || c == '"') {
1609 int quote = c;
1610 int quote_size = 1; /* 1 or 3 */
1611 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001612
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 /* Find the quote size and start of string */
1614 c = tok_nextc(tok);
1615 if (c == quote) {
1616 c = tok_nextc(tok);
1617 if (c == quote)
1618 quote_size = 3;
1619 else
1620 end_quote_size = 1; /* empty string found */
1621 }
1622 if (c != quote)
1623 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001624
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 /* Get rest of string */
1626 while (end_quote_size != quote_size) {
1627 c = tok_nextc(tok);
1628 if (c == EOF) {
1629 if (quote_size == 3)
1630 tok->done = E_EOFS;
1631 else
1632 tok->done = E_EOLS;
1633 tok->cur = tok->inp;
1634 return ERRORTOKEN;
1635 }
1636 if (quote_size == 1 && c == '\n') {
1637 tok->done = E_EOLS;
1638 tok->cur = tok->inp;
1639 return ERRORTOKEN;
1640 }
1641 if (c == quote)
1642 end_quote_size += 1;
1643 else {
1644 end_quote_size = 0;
1645 if (c == '\\')
1646 c = tok_nextc(tok); /* skip escaped char */
1647 }
1648 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001649
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 *p_start = tok->start;
1651 *p_end = tok->cur;
1652 return STRING;
1653 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001654
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 /* Line continuation */
1656 if (c == '\\') {
1657 c = tok_nextc(tok);
1658 if (c != '\n') {
1659 tok->done = E_LINECONT;
1660 tok->cur = tok->inp;
1661 return ERRORTOKEN;
1662 }
1663 tok->cont_line = 1;
1664 goto again; /* Read next line */
1665 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001666
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001667 /* Check for two-character token */
1668 {
1669 int c2 = tok_nextc(tok);
1670 int token = PyToken_TwoChars(c, c2);
1671 if (token != OP) {
1672 int c3 = tok_nextc(tok);
1673 int token3 = PyToken_ThreeChars(c, c2, c3);
1674 if (token3 != OP) {
1675 token = token3;
1676 } else {
1677 tok_backup(tok, c3);
1678 }
1679 *p_start = tok->start;
1680 *p_end = tok->cur;
1681 return token;
1682 }
1683 tok_backup(tok, c2);
1684 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001685
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 /* Keep track of parentheses nesting level */
1687 switch (c) {
1688 case '(':
1689 case '[':
1690 case '{':
1691 tok->level++;
1692 break;
1693 case ')':
1694 case ']':
1695 case '}':
1696 tok->level--;
1697 break;
1698 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001699
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 /* Punctuation character */
1701 *p_start = tok->start;
1702 *p_end = tok->cur;
1703 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001704}
1705
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001706int
1707PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1708{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001709 int result = tok_get(tok, p_start, p_end);
1710 if (tok->decoding_erred) {
1711 result = ERRORTOKEN;
1712 tok->done = E_DECODE;
1713 }
1714 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001715}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001716
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001717/* Get the encoding of a Python file. Check for the coding cookie and check if
1718 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001719
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001720 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1721 encoding in the first or second line of the file (in which case the encoding
1722 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001723
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001724 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1725 by the caller. */
1726
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001727char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001728PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001729{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 struct tok_state *tok;
1731 FILE *fp;
1732 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001733
Victor Stinnerdaf45552013-08-28 00:53:59 +02001734#ifndef PGEN
1735 fd = _Py_dup(fd);
1736#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001738#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001739 if (fd < 0) {
1740 return NULL;
1741 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001742
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001743 fp = fdopen(fd, "r");
1744 if (fp == NULL) {
1745 return NULL;
1746 }
1747 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1748 if (tok == NULL) {
1749 fclose(fp);
1750 return NULL;
1751 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001752#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001753 if (filename != NULL) {
1754 Py_INCREF(filename);
1755 tok->filename = filename;
1756 }
1757 else {
1758 tok->filename = PyUnicode_FromString("<string>");
1759 if (tok->filename == NULL) {
1760 fclose(fp);
1761 PyTokenizer_Free(tok);
1762 return encoding;
1763 }
1764 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001765#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 while (tok->lineno < 2 && tok->done == E_OK) {
1767 PyTokenizer_Get(tok, &p_start, &p_end);
1768 }
1769 fclose(fp);
1770 if (tok->encoding) {
1771 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1772 if (encoding)
1773 strcpy(encoding, tok->encoding);
1774 }
1775 PyTokenizer_Free(tok);
1776 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001777}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001778
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001779char *
1780PyTokenizer_FindEncoding(int fd)
1781{
1782 return PyTokenizer_FindEncodingFilename(fd, NULL);
1783}
1784
Guido van Rossum408027e1996-12-30 16:17:54 +00001785#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001786
1787void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001788tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001789{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001790 printf("%s", _PyParser_TokenNames[type]);
1791 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1792 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001793}
1794
1795#endif