blob: 441d05a9bbc8cb09f2e43b1e9ee6708cda9f6e79 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero when C can begin an identifier: an ASCII letter, an
   underscore, or any value of 128 and above.  C is expanded several
   times, so it must be an expression without side effects; it is
   parenthesized so that operators inside the argument cannot rebind
   against the comparisons. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero when C can continue an identifier: an ASCII letter or digit,
   an underscore, or any value of 128 and above.  C is expanded several
   times, so it must be an expression without side effects; it is
   parenthesized so that operators inside the argument cannot rebind
   against the comparisons. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names: one printable string per token type. */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER", "EQUAL",
    "DOT", "PERCENT", "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
131 tok->filename = NULL;
132 tok->altwarning = 1;
133 tok->alterror = 1;
134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
136 tok->decoding_state = STATE_INIT;
137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
139 tok->enc = NULL;
140 tok->encoding = NULL;
141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding is done; return a new_string() copy of STR
   (NULL if that allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Normalize the spelling of a UTF-8 or Latin-1 encoding name.

   Up to the first 12 characters of S are lower-cased, with '_' turned
   into '-', and compared against the known spellings:
     - "utf-8" or anything starting with "utf-8-"     -> "utf-8"
     - "latin-1", "iso-8859-1", "iso-latin-1", or any of
       those followed by '-'                          -> "iso-8859-1"
   The result is then a pointer to a string literal.  For any other
   name S itself is returned unchanged, so callers can detect a
   substitution by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: tolower() is only defined for EOF
           and values representable as unsigned char, and plain char
           may be negative for non-ASCII bytes. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Victor Stinner22a351a2010-10-14 12:04:34 +0000465 int fd;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 io = PyImport_ImportModuleNoBlock("io");
468 if (io == NULL)
469 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000470
Victor Stinner22a351a2010-10-14 12:04:34 +0000471 fd = fileno(tok->fp);
472 if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
473 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
474 goto cleanup;
475 }
476
477 stream = PyObject_CallMethod(io, "open", "isisOOO",
478 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 if (stream == NULL)
480 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 Py_XDECREF(tok->decoding_readline);
483 readline = PyObject_GetAttrString(stream, "readline");
484 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000485
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000486 /* The file has been reopened; parsing will restart from
487 * the beginning of the file, we have to reset the line number.
488 * But this function has been called from inside tok_nextc() which
489 * will increment lineno before it returns. So we set it -1 so that
490 * the next call to tok_nextc() will start with tok->lineno == 0.
491 */
492 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000493
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000494 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 Py_XDECREF(stream);
496 Py_XDECREF(io);
497 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498}
499
500/* Fetch the next byte from TOK. */
501
502static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504}
505
506/* Unfetch the last byte back into TOK. */
507
508static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510}
511
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the structure is checked: a lead
   byte announcing 1, 2 or 3 continuation bytes, followed by that many
   bytes in the range 0x80..0xBF.  Continuation bytes are examined
   front to back so that a sequence cut short by the terminating NUL is
   rejected at the NUL, without reading the bytes that follow it. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
539
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540/* Read a line of input from TOK. Determine encoding
541 if necessary. */
542
543static char *
544decoding_fgets(char *s, int size, struct tok_state *tok)
545{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 char *line = NULL;
547 int badchar = 0;
Victor Stinner83098a42010-12-27 20:12:13 +0000548 PyObject *filename;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000549 for (;;) {
550 if (tok->decoding_state == STATE_NORMAL) {
551 /* We already have a codec associated with
552 this input. */
553 line = fp_readl(s, size, tok);
554 break;
555 } else if (tok->decoding_state == STATE_RAW) {
556 /* We want a 'raw' read. */
557 line = Py_UniversalNewlineFgets(s, size,
558 tok->fp, NULL);
559 break;
560 } else {
561 /* We have not yet determined the encoding.
562 If an encoding is found, use the file-pointer
563 reader functions from now on. */
564 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
565 return error_ret(tok);
566 assert(tok->decoding_state != STATE_INIT);
567 }
568 }
569 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
570 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
571 return error_ret(tok);
572 }
573 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 /* The default encoding is UTF-8, so make sure we don't have any
576 non-UTF-8 sequences in it. */
577 if (line && !tok->encoding) {
578 unsigned char *c;
579 int length;
580 for (c = (unsigned char *)line; *c; c += length)
581 if (!(length = valid_utf8(c))) {
582 badchar = *c;
583 break;
584 }
585 }
586 if (badchar) {
587 /* Need to add 1 to the line number, since this line
588 has not been counted, yet. */
Victor Stinner83098a42010-12-27 20:12:13 +0000589 filename = PyUnicode_DecodeFSDefault(tok->filename);
590 if (filename != NULL) {
591 PyErr_Format(PyExc_SyntaxError,
592 "Non-UTF-8 code starting with '\\x%.2x' "
593 "in file %.200U on line %i, "
594 "but no encoding declared; "
595 "see http://python.org/dev/peps/pep-0263/ for details",
596 badchar, filename, tok->lineno + 1);
597 Py_DECREF(filename);
598 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000599 return error_ret(tok);
600 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603}
604
605static int
606decoding_feof(struct tok_state *tok)
607{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 if (tok->decoding_state != STATE_NORMAL) {
609 return feof(tok->fp);
610 } else {
611 PyObject* buf = tok->decoding_buffer;
612 if (buf == NULL) {
613 buf = PyObject_CallObject(tok->decoding_readline, NULL);
614 if (buf == NULL) {
615 error_ret(tok);
616 return 1;
617 } else {
618 tok->decoding_buffer = buf;
619 }
620 }
621 return PyObject_Length(buf) == 0;
622 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623}
624
625/* Fetch a byte from TOK, using the string buffer. */
626
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000627static int
628buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000629 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630}
631
632/* Unfetch a byte from TOK, using the string buffer. */
633
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000634static void
635buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 tok->str--;
637 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638}
639
640/* Set the readline function for TOK to ENC. For the string-based
641 tokenizer, this means to just record the encoding. */
642
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000643static int
644buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 tok->enc = enc;
646 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000647}
648
649/* Return a UTF-8 encoding Python string object from the
650 C byte string STR, which is encoded with ENC. */
651
652static PyObject *
653translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 PyObject *utf8;
655 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
656 if (buf == NULL)
657 return NULL;
658 utf8 = PyUnicode_AsUTF8String(buf);
659 Py_DECREF(buf);
660 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661}
662
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000663
664static char *
665translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
667 char *buf, *current;
668 char c = '\0';
669 buf = PyMem_MALLOC(needed_length);
670 if (buf == NULL) {
671 tok->done = E_NOMEM;
672 return NULL;
673 }
674 for (current = buf; *s; s++, current++) {
675 c = *s;
676 if (skip_next_lf) {
677 skip_next_lf = 0;
678 if (c == '\n') {
679 c = *++s;
680 if (!c)
681 break;
682 }
683 }
684 if (c == '\r') {
685 skip_next_lf = 1;
686 c = '\n';
687 }
688 *current = c;
689 }
690 /* If this is exec input, add a newline to the end of the string if
691 there isn't one already. */
692 if (exec_input && c != '\n') {
693 *current = '\n';
694 current++;
695 }
696 *current = '\0';
697 final_length = current - buf + 1;
698 if (final_length < needed_length && final_length)
699 /* should never fail */
700 buf = PyMem_REALLOC(buf, final_length);
701 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000702}
703
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000704/* Decode a byte string STR for use as the buffer of TOK.
705 Look for encoding declarations inside STR, and record them
706 inside TOK. */
707
708static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000709decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000710{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 PyObject* utf8 = NULL;
712 const char *str;
713 const char *s;
714 const char *newl[2] = {NULL, NULL};
715 int lineno = 0;
716 tok->input = str = translate_newlines(input, single, tok);
717 if (str == NULL)
718 return NULL;
719 tok->enc = NULL;
720 tok->str = str;
721 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
722 return error_ret(tok);
723 str = tok->str; /* string after BOM if any */
724 assert(str);
725 if (tok->enc != NULL) {
726 utf8 = translate_into_utf8(str, tok->enc);
727 if (utf8 == NULL)
728 return error_ret(tok);
729 str = PyBytes_AsString(utf8);
730 }
731 for (s = str;; s++) {
732 if (*s == '\0') break;
733 else if (*s == '\n') {
734 assert(lineno < 2);
735 newl[lineno] = s;
736 lineno++;
737 if (lineno == 2) break;
738 }
739 }
740 tok->enc = NULL;
741 /* need to check line 1 and 2 separately since check_coding_spec
742 assumes a single line as input */
743 if (newl[0]) {
744 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
745 return error_ret(tok);
746 if (tok->enc == NULL && newl[1]) {
747 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
748 tok, buf_setreadl))
749 return error_ret(tok);
750 }
751 }
752 if (tok->enc != NULL) {
753 assert(utf8 == NULL);
754 utf8 = translate_into_utf8(str, tok->enc);
755 if (utf8 == NULL)
756 return error_ret(tok);
757 str = PyBytes_AS_STRING(utf8);
758 }
759 assert(tok->decoding_buffer == NULL);
760 tok->decoding_buffer = utf8; /* CAUTION */
761 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000762}
763
764#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765
766/* Set up tokenizer for string */
767
768struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000769PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 struct tok_state *tok = tok_new();
772 if (tok == NULL)
773 return NULL;
774 str = (char *)decode_str(str, exec_input, tok);
775 if (str == NULL) {
776 PyTokenizer_Free(tok);
777 return NULL;
778 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000779
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 /* XXX: constify members. */
781 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
782 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783}
784
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000785struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000786PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000787{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 struct tok_state *tok = tok_new();
789 if (tok == NULL)
790 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000791#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000793#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 if (str == NULL) {
795 PyTokenizer_Free(tok);
796 return NULL;
797 }
798 tok->decoding_state = STATE_RAW;
799 tok->read_coding_spec = 1;
800 tok->enc = NULL;
801 tok->str = str;
802 tok->encoding = (char *)PyMem_MALLOC(6);
803 if (!tok->encoding) {
804 PyTokenizer_Free(tok);
805 return NULL;
806 }
807 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000808
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 /* XXX: constify members. */
810 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
811 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000812}
813
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000814/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000815
816struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000817PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 struct tok_state *tok = tok_new();
820 if (tok == NULL)
821 return NULL;
822 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
823 PyTokenizer_Free(tok);
824 return NULL;
825 }
826 tok->cur = tok->inp = tok->buf;
827 tok->end = tok->buf + BUFSIZ;
828 tok->fp = fp;
829 tok->prompt = ps1;
830 tok->nextprompt = ps2;
831 if (enc != NULL) {
832 /* Must copy encoding declaration since it
833 gets copied into the parse tree. */
834 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
835 if (!tok->encoding) {
836 PyTokenizer_Free(tok);
837 return NULL;
838 }
839 strcpy(tok->encoding, enc);
840 tok->decoding_state = STATE_NORMAL;
841 }
842 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843}
844
845
846/* Free a tok_state structure */
847
848void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000849PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000850{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000851 if (tok->encoding != NULL)
852 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000853#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 Py_XDECREF(tok->decoding_readline);
855 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000856#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 if (tok->fp != NULL && tok->buf != NULL)
858 PyMem_FREE(tok->buf);
859 if (tok->input)
860 PyMem_FREE((char *)tok->input);
861 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862}
863
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000864/* Get next char, updating state; error code goes into tok->done */
865
866static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000867tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869 for (;;) {
870 if (tok->cur != tok->inp) {
871 return Py_CHARMASK(*tok->cur++); /* Fast path */
872 }
873 if (tok->done != E_OK)
874 return EOF;
875 if (tok->fp == NULL) {
876 char *end = strchr(tok->inp, '\n');
877 if (end != NULL)
878 end++;
879 else {
880 end = strchr(tok->inp, '\0');
881 if (end == tok->inp) {
882 tok->done = E_EOF;
883 return EOF;
884 }
885 }
886 if (tok->start == NULL)
887 tok->buf = tok->cur;
888 tok->line_start = tok->cur;
889 tok->lineno++;
890 tok->inp = end;
891 return Py_CHARMASK(*tok->cur++);
892 }
893 if (tok->prompt != NULL) {
894 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000895#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 if (tok->encoding && newtok && *newtok) {
897 /* Recode to UTF-8 */
898 Py_ssize_t buflen;
899 const char* buf;
900 PyObject *u = translate_into_utf8(newtok, tok->encoding);
901 PyMem_FREE(newtok);
902 if (!u) {
903 tok->done = E_DECODE;
904 return EOF;
905 }
906 buflen = PyBytes_GET_SIZE(u);
907 buf = PyBytes_AS_STRING(u);
908 if (!buf) {
909 Py_DECREF(u);
910 tok->done = E_DECODE;
911 return EOF;
912 }
913 newtok = PyMem_MALLOC(buflen+1);
914 strcpy(newtok, buf);
915 Py_DECREF(u);
916 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000917#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000918 if (tok->nextprompt != NULL)
919 tok->prompt = tok->nextprompt;
920 if (newtok == NULL)
921 tok->done = E_INTR;
922 else if (*newtok == '\0') {
923 PyMem_FREE(newtok);
924 tok->done = E_EOF;
925 }
926 else if (tok->start != NULL) {
927 size_t start = tok->start - tok->buf;
928 size_t oldlen = tok->cur - tok->buf;
929 size_t newlen = oldlen + strlen(newtok);
930 char *buf = tok->buf;
931 buf = (char *)PyMem_REALLOC(buf, newlen+1);
932 tok->lineno++;
933 if (buf == NULL) {
934 PyMem_FREE(tok->buf);
935 tok->buf = NULL;
936 PyMem_FREE(newtok);
937 tok->done = E_NOMEM;
938 return EOF;
939 }
940 tok->buf = buf;
941 tok->cur = tok->buf + oldlen;
942 tok->line_start = tok->cur;
943 strcpy(tok->buf + oldlen, newtok);
944 PyMem_FREE(newtok);
945 tok->inp = tok->buf + newlen;
946 tok->end = tok->inp + 1;
947 tok->start = tok->buf + start;
948 }
949 else {
950 tok->lineno++;
951 if (tok->buf != NULL)
952 PyMem_FREE(tok->buf);
953 tok->buf = newtok;
954 tok->line_start = tok->buf;
955 tok->cur = tok->buf;
956 tok->line_start = tok->buf;
957 tok->inp = strchr(tok->buf, '\0');
958 tok->end = tok->inp + 1;
959 }
960 }
961 else {
962 int done = 0;
963 Py_ssize_t cur = 0;
964 char *pt;
965 if (tok->start == NULL) {
966 if (tok->buf == NULL) {
967 tok->buf = (char *)
968 PyMem_MALLOC(BUFSIZ);
969 if (tok->buf == NULL) {
970 tok->done = E_NOMEM;
971 return EOF;
972 }
973 tok->end = tok->buf + BUFSIZ;
974 }
975 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
976 tok) == NULL) {
977 tok->done = E_EOF;
978 done = 1;
979 }
980 else {
981 tok->done = E_OK;
982 tok->inp = strchr(tok->buf, '\0');
983 done = tok->inp[-1] == '\n';
984 }
985 }
986 else {
987 cur = tok->cur - tok->buf;
988 if (decoding_feof(tok)) {
989 tok->done = E_EOF;
990 done = 1;
991 }
992 else
993 tok->done = E_OK;
994 }
995 tok->lineno++;
996 /* Read until '\n' or EOF */
997 while (!done) {
998 Py_ssize_t curstart = tok->start == NULL ? -1 :
999 tok->start - tok->buf;
1000 Py_ssize_t curvalid = tok->inp - tok->buf;
1001 Py_ssize_t newsize = curvalid + BUFSIZ;
1002 char *newbuf = tok->buf;
1003 newbuf = (char *)PyMem_REALLOC(newbuf,
1004 newsize);
1005 if (newbuf == NULL) {
1006 tok->done = E_NOMEM;
1007 tok->cur = tok->inp;
1008 return EOF;
1009 }
1010 tok->buf = newbuf;
1011 tok->inp = tok->buf + curvalid;
1012 tok->end = tok->buf + newsize;
1013 tok->start = curstart < 0 ? NULL :
1014 tok->buf + curstart;
1015 if (decoding_fgets(tok->inp,
1016 (int)(tok->end - tok->inp),
1017 tok) == NULL) {
1018 /* Break out early on decoding
1019 errors, as tok->buf will be NULL
1020 */
1021 if (tok->decoding_erred)
1022 return EOF;
1023 /* Last line does not end in \n,
1024 fake one */
1025 strcpy(tok->inp, "\n");
1026 }
1027 tok->inp = strchr(tok->inp, '\0');
1028 done = tok->inp[-1] == '\n';
1029 }
1030 if (tok->buf != NULL) {
1031 tok->cur = tok->buf + cur;
1032 tok->line_start = tok->cur;
1033 /* replace "\r\n" with "\n" */
1034 /* For Mac leave the \r, giving a syntax error */
1035 pt = tok->inp - 2;
1036 if (pt >= tok->buf && *pt == '\r') {
1037 *pt++ = '\n';
1038 *pt = '\0';
1039 tok->inp = pt;
1040 }
1041 }
1042 }
1043 if (tok->done != E_OK) {
1044 if (tok->prompt != NULL)
1045 PySys_WriteStderr("\n");
1046 tok->cur = tok->inp;
1047 return EOF;
1048 }
1049 }
1050 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001051}
1052
1053
1054/* Back-up one character */
1055
1056static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001057tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001058{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 if (c != EOF) {
1060 if (--tok->cur < tok->buf)
1061 Py_FatalError("tok_backup: beginning of buffer");
1062 if (*tok->cur != c)
1063 *tok->cur = c;
1064 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001065}
1066
1067
1068/* Return the token corresponding to a single character */
1069
1070int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001071PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001072{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 switch (c) {
1074 case '(': return LPAR;
1075 case ')': return RPAR;
1076 case '[': return LSQB;
1077 case ']': return RSQB;
1078 case ':': return COLON;
1079 case ',': return COMMA;
1080 case ';': return SEMI;
1081 case '+': return PLUS;
1082 case '-': return MINUS;
1083 case '*': return STAR;
1084 case '/': return SLASH;
1085 case '|': return VBAR;
1086 case '&': return AMPER;
1087 case '<': return LESS;
1088 case '>': return GREATER;
1089 case '=': return EQUAL;
1090 case '.': return DOT;
1091 case '%': return PERCENT;
1092 case '{': return LBRACE;
1093 case '}': return RBRACE;
1094 case '^': return CIRCUMFLEX;
1095 case '~': return TILDE;
1096 case '@': return AT;
1097 default: return OP;
1098 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001099}
1100
1101
Guido van Rossumfbab9051991-10-20 20:25:03 +00001102int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001103PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001104{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 switch (c1) {
1106 case '=':
1107 switch (c2) {
1108 case '=': return EQEQUAL;
1109 }
1110 break;
1111 case '!':
1112 switch (c2) {
1113 case '=': return NOTEQUAL;
1114 }
1115 break;
1116 case '<':
1117 switch (c2) {
1118 case '>': return NOTEQUAL;
1119 case '=': return LESSEQUAL;
1120 case '<': return LEFTSHIFT;
1121 }
1122 break;
1123 case '>':
1124 switch (c2) {
1125 case '=': return GREATEREQUAL;
1126 case '>': return RIGHTSHIFT;
1127 }
1128 break;
1129 case '+':
1130 switch (c2) {
1131 case '=': return PLUSEQUAL;
1132 }
1133 break;
1134 case '-':
1135 switch (c2) {
1136 case '=': return MINEQUAL;
1137 case '>': return RARROW;
1138 }
1139 break;
1140 case '*':
1141 switch (c2) {
1142 case '*': return DOUBLESTAR;
1143 case '=': return STAREQUAL;
1144 }
1145 break;
1146 case '/':
1147 switch (c2) {
1148 case '/': return DOUBLESLASH;
1149 case '=': return SLASHEQUAL;
1150 }
1151 break;
1152 case '|':
1153 switch (c2) {
1154 case '=': return VBAREQUAL;
1155 }
1156 break;
1157 case '%':
1158 switch (c2) {
1159 case '=': return PERCENTEQUAL;
1160 }
1161 break;
1162 case '&':
1163 switch (c2) {
1164 case '=': return AMPEREQUAL;
1165 }
1166 break;
1167 case '^':
1168 switch (c2) {
1169 case '=': return CIRCUMFLEXEQUAL;
1170 }
1171 break;
1172 }
1173 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001174}
1175
Thomas Wouters434d0822000-08-24 20:11:32 +00001176int
1177PyToken_ThreeChars(int c1, int c2, int c3)
1178{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 switch (c1) {
1180 case '<':
1181 switch (c2) {
1182 case '<':
1183 switch (c3) {
1184 case '=':
1185 return LEFTSHIFTEQUAL;
1186 }
1187 break;
1188 }
1189 break;
1190 case '>':
1191 switch (c2) {
1192 case '>':
1193 switch (c3) {
1194 case '=':
1195 return RIGHTSHIFTEQUAL;
1196 }
1197 break;
1198 }
1199 break;
1200 case '*':
1201 switch (c2) {
1202 case '*':
1203 switch (c3) {
1204 case '=':
1205 return DOUBLESTAREQUAL;
1206 }
1207 break;
1208 }
1209 break;
1210 case '/':
1211 switch (c2) {
1212 case '/':
1213 switch (c3) {
1214 case '=':
1215 return DOUBLESLASHEQUAL;
1216 }
1217 break;
1218 }
1219 break;
1220 case '.':
1221 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001222 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 switch (c3) {
1224 case '.':
1225 return ELLIPSIS;
1226 }
1227 break;
1228 }
1229 break;
1230 }
1231 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001232}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001233
Guido van Rossum926f13a1998-04-09 21:38:06 +00001234static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001235indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001236{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001237 if (tok->alterror) {
1238 tok->done = E_TABSPACE;
1239 tok->cur = tok->inp;
1240 return 1;
1241 }
1242 if (tok->altwarning) {
1243 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1244 "in indentation\n", tok->filename);
1245 tok->altwarning = 0;
1246 }
1247 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001248}
1249
Martin v. Löwis47383402007-08-15 07:32:56 +00001250#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001251#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001252#else
1253/* Verify that the identifier follows PEP 3131. */
1254static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001255verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001256{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 PyObject *s;
1258 int result;
1259 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1260 if (s == NULL) {
1261 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1262 PyErr_Clear();
1263 tok->done = E_IDENTIFIER;
1264 } else {
1265 tok->done = E_ERROR;
1266 }
1267 return 0;
1268 }
1269 result = PyUnicode_IsIdentifier(s);
1270 Py_DECREF(s);
1271 if (result == 0)
1272 tok->done = E_IDENTIFIER;
1273 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001274}
1275#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001276
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277/* Get next token, after space stripping etc. */
1278
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001279static int
1280tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 register int c;
1283 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001284
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001286 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 tok->start = NULL;
1288 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001289
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 /* Get indentation level */
1291 if (tok->atbol) {
1292 register int col = 0;
1293 register int altcol = 0;
1294 tok->atbol = 0;
1295 for (;;) {
1296 c = tok_nextc(tok);
1297 if (c == ' ')
1298 col++, altcol++;
1299 else if (c == '\t') {
1300 col = (col/tok->tabsize + 1) * tok->tabsize;
1301 altcol = (altcol/tok->alttabsize + 1)
1302 * tok->alttabsize;
1303 }
1304 else if (c == '\014') /* Control-L (formfeed) */
1305 col = altcol = 0; /* For Emacs users */
1306 else
1307 break;
1308 }
1309 tok_backup(tok, c);
1310 if (c == '#' || c == '\n') {
1311 /* Lines with only whitespace and/or comments
1312 shouldn't affect the indentation and are
1313 not passed to the parser as NEWLINE tokens,
1314 except *totally* empty lines in interactive
1315 mode, which signal the end of a command group. */
1316 if (col == 0 && c == '\n' && tok->prompt != NULL)
1317 blankline = 0; /* Let it through */
1318 else
1319 blankline = 1; /* Ignore completely */
1320 /* We can't jump back right here since we still
1321 may need to skip to the end of a comment */
1322 }
1323 if (!blankline && tok->level == 0) {
1324 if (col == tok->indstack[tok->indent]) {
1325 /* No change */
1326 if (altcol != tok->altindstack[tok->indent]) {
1327 if (indenterror(tok))
1328 return ERRORTOKEN;
1329 }
1330 }
1331 else if (col > tok->indstack[tok->indent]) {
1332 /* Indent -- always one */
1333 if (tok->indent+1 >= MAXINDENT) {
1334 tok->done = E_TOODEEP;
1335 tok->cur = tok->inp;
1336 return ERRORTOKEN;
1337 }
1338 if (altcol <= tok->altindstack[tok->indent]) {
1339 if (indenterror(tok))
1340 return ERRORTOKEN;
1341 }
1342 tok->pendin++;
1343 tok->indstack[++tok->indent] = col;
1344 tok->altindstack[tok->indent] = altcol;
1345 }
1346 else /* col < tok->indstack[tok->indent] */ {
1347 /* Dedent -- any number, must be consistent */
1348 while (tok->indent > 0 &&
1349 col < tok->indstack[tok->indent]) {
1350 tok->pendin--;
1351 tok->indent--;
1352 }
1353 if (col != tok->indstack[tok->indent]) {
1354 tok->done = E_DEDENT;
1355 tok->cur = tok->inp;
1356 return ERRORTOKEN;
1357 }
1358 if (altcol != tok->altindstack[tok->indent]) {
1359 if (indenterror(tok))
1360 return ERRORTOKEN;
1361 }
1362 }
1363 }
1364 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 /* Return pending indents/dedents */
1369 if (tok->pendin != 0) {
1370 if (tok->pendin < 0) {
1371 tok->pendin++;
1372 return DEDENT;
1373 }
1374 else {
1375 tok->pendin--;
1376 return INDENT;
1377 }
1378 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001379
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 tok->start = NULL;
1382 /* Skip spaces */
1383 do {
1384 c = tok_nextc(tok);
1385 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001386
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 /* Set start of current token */
1388 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 /* Skip comment */
1391 if (c == '#')
1392 while (c != EOF && c != '\n')
1393 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001394
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 /* Check for EOF and errors now */
1396 if (c == EOF) {
1397 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1398 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001399
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 /* Identifier (most frequent token!) */
1401 nonascii = 0;
1402 if (is_potential_identifier_start(c)) {
1403 /* Process b"", r"" and br"" */
1404 if (c == 'b' || c == 'B') {
1405 c = tok_nextc(tok);
1406 if (c == '"' || c == '\'')
1407 goto letter_quote;
1408 }
1409 if (c == 'r' || c == 'R') {
1410 c = tok_nextc(tok);
1411 if (c == '"' || c == '\'')
1412 goto letter_quote;
1413 }
1414 while (is_potential_identifier_char(c)) {
1415 if (c >= 128)
1416 nonascii = 1;
1417 c = tok_nextc(tok);
1418 }
1419 tok_backup(tok, c);
1420 if (nonascii &&
1421 !verify_identifier(tok)) {
1422 tok->done = E_IDENTIFIER;
1423 return ERRORTOKEN;
1424 }
1425 *p_start = tok->start;
1426 *p_end = tok->cur;
1427 return NAME;
1428 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 /* Newline */
1431 if (c == '\n') {
1432 tok->atbol = 1;
1433 if (blankline || tok->level > 0)
1434 goto nextline;
1435 *p_start = tok->start;
1436 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1437 tok->cont_line = 0;
1438 return NEWLINE;
1439 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 /* Period or number starting with period? */
1442 if (c == '.') {
1443 c = tok_nextc(tok);
1444 if (isdigit(c)) {
1445 goto fraction;
1446 } else if (c == '.') {
1447 c = tok_nextc(tok);
1448 if (c == '.') {
1449 *p_start = tok->start;
1450 *p_end = tok->cur;
1451 return ELLIPSIS;
1452 } else {
1453 tok_backup(tok, c);
1454 }
1455 tok_backup(tok, '.');
1456 } else {
1457 tok_backup(tok, c);
1458 }
1459 *p_start = tok->start;
1460 *p_end = tok->cur;
1461 return DOT;
1462 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 /* Number */
1465 if (isdigit(c)) {
1466 if (c == '0') {
1467 /* Hex, octal or binary -- maybe. */
1468 c = tok_nextc(tok);
1469 if (c == '.')
1470 goto fraction;
1471 if (c == 'j' || c == 'J')
1472 goto imaginary;
1473 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001474
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001475 /* Hex */
1476 c = tok_nextc(tok);
1477 if (!isxdigit(c)) {
1478 tok->done = E_TOKEN;
1479 tok_backup(tok, c);
1480 return ERRORTOKEN;
1481 }
1482 do {
1483 c = tok_nextc(tok);
1484 } while (isxdigit(c));
1485 }
1486 else if (c == 'o' || c == 'O') {
1487 /* Octal */
1488 c = tok_nextc(tok);
1489 if (c < '0' || c >= '8') {
1490 tok->done = E_TOKEN;
1491 tok_backup(tok, c);
1492 return ERRORTOKEN;
1493 }
1494 do {
1495 c = tok_nextc(tok);
1496 } while ('0' <= c && c < '8');
1497 }
1498 else if (c == 'b' || c == 'B') {
1499 /* Binary */
1500 c = tok_nextc(tok);
1501 if (c != '0' && c != '1') {
1502 tok->done = E_TOKEN;
1503 tok_backup(tok, c);
1504 return ERRORTOKEN;
1505 }
1506 do {
1507 c = tok_nextc(tok);
1508 } while (c == '0' || c == '1');
1509 }
1510 else {
1511 int nonzero = 0;
1512 /* maybe old-style octal; c is first char of it */
1513 /* in any case, allow '0' as a literal */
1514 while (c == '0')
1515 c = tok_nextc(tok);
1516 while (isdigit(c)) {
1517 nonzero = 1;
1518 c = tok_nextc(tok);
1519 }
1520 if (c == '.')
1521 goto fraction;
1522 else if (c == 'e' || c == 'E')
1523 goto exponent;
1524 else if (c == 'j' || c == 'J')
1525 goto imaginary;
1526 else if (nonzero) {
1527 tok->done = E_TOKEN;
1528 tok_backup(tok, c);
1529 return ERRORTOKEN;
1530 }
1531 }
1532 }
1533 else {
1534 /* Decimal */
1535 do {
1536 c = tok_nextc(tok);
1537 } while (isdigit(c));
1538 {
1539 /* Accept floating point numbers. */
1540 if (c == '.') {
1541 fraction:
1542 /* Fraction */
1543 do {
1544 c = tok_nextc(tok);
1545 } while (isdigit(c));
1546 }
1547 if (c == 'e' || c == 'E') {
1548 exponent:
1549 /* Exponent part */
1550 c = tok_nextc(tok);
1551 if (c == '+' || c == '-')
1552 c = tok_nextc(tok);
1553 if (!isdigit(c)) {
1554 tok->done = E_TOKEN;
1555 tok_backup(tok, c);
1556 return ERRORTOKEN;
1557 }
1558 do {
1559 c = tok_nextc(tok);
1560 } while (isdigit(c));
1561 }
1562 if (c == 'j' || c == 'J')
1563 /* Imaginary part */
1564 imaginary:
1565 c = tok_nextc(tok);
1566 }
1567 }
1568 tok_backup(tok, c);
1569 *p_start = tok->start;
1570 *p_end = tok->cur;
1571 return NUMBER;
1572 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001573
1574 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 /* String */
1576 if (c == '\'' || c == '"') {
1577 int quote = c;
1578 int quote_size = 1; /* 1 or 3 */
1579 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001580
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001581 /* Find the quote size and start of string */
1582 c = tok_nextc(tok);
1583 if (c == quote) {
1584 c = tok_nextc(tok);
1585 if (c == quote)
1586 quote_size = 3;
1587 else
1588 end_quote_size = 1; /* empty string found */
1589 }
1590 if (c != quote)
1591 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001592
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 /* Get rest of string */
1594 while (end_quote_size != quote_size) {
1595 c = tok_nextc(tok);
1596 if (c == EOF) {
1597 if (quote_size == 3)
1598 tok->done = E_EOFS;
1599 else
1600 tok->done = E_EOLS;
1601 tok->cur = tok->inp;
1602 return ERRORTOKEN;
1603 }
1604 if (quote_size == 1 && c == '\n') {
1605 tok->done = E_EOLS;
1606 tok->cur = tok->inp;
1607 return ERRORTOKEN;
1608 }
1609 if (c == quote)
1610 end_quote_size += 1;
1611 else {
1612 end_quote_size = 0;
1613 if (c == '\\')
1614 c = tok_nextc(tok); /* skip escaped char */
1615 }
1616 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001617
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 *p_start = tok->start;
1619 *p_end = tok->cur;
1620 return STRING;
1621 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001622
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 /* Line continuation */
1624 if (c == '\\') {
1625 c = tok_nextc(tok);
1626 if (c != '\n') {
1627 tok->done = E_LINECONT;
1628 tok->cur = tok->inp;
1629 return ERRORTOKEN;
1630 }
1631 tok->cont_line = 1;
1632 goto again; /* Read next line */
1633 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001634
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 /* Check for two-character token */
1636 {
1637 int c2 = tok_nextc(tok);
1638 int token = PyToken_TwoChars(c, c2);
1639 if (token != OP) {
1640 int c3 = tok_nextc(tok);
1641 int token3 = PyToken_ThreeChars(c, c2, c3);
1642 if (token3 != OP) {
1643 token = token3;
1644 } else {
1645 tok_backup(tok, c3);
1646 }
1647 *p_start = tok->start;
1648 *p_end = tok->cur;
1649 return token;
1650 }
1651 tok_backup(tok, c2);
1652 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001653
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 /* Keep track of parentheses nesting level */
1655 switch (c) {
1656 case '(':
1657 case '[':
1658 case '{':
1659 tok->level++;
1660 break;
1661 case ')':
1662 case ']':
1663 case '}':
1664 tok->level--;
1665 break;
1666 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001667
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 /* Punctuation character */
1669 *p_start = tok->start;
1670 *p_end = tok->cur;
1671 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001672}
1673
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001674int
1675PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1676{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 int result = tok_get(tok, p_start, p_end);
1678 if (tok->decoding_erred) {
1679 result = ERRORTOKEN;
1680 tok->done = E_DECODE;
1681 }
1682 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001683}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001684
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001685/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001686
1687 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001688 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001689 should be assumed to be PyUnicode_GetDefaultEncoding()).
1690
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001691 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1692 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001693*/
1694char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001695PyTokenizer_FindEncoding(int fd)
1696{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 struct tok_state *tok;
1698 FILE *fp;
1699 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001700
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 fd = dup(fd);
1702 if (fd < 0) {
1703 return NULL;
1704 }
1705 fp = fdopen(fd, "r");
1706 if (fp == NULL) {
1707 return NULL;
1708 }
1709 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1710 if (tok == NULL) {
1711 fclose(fp);
1712 return NULL;
1713 }
1714 while (tok->lineno < 2 && tok->done == E_OK) {
1715 PyTokenizer_Get(tok, &p_start, &p_end);
1716 }
1717 fclose(fp);
1718 if (tok->encoding) {
1719 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1720 if (encoding)
1721 strcpy(encoding, tok->encoding);
1722 }
1723 PyTokenizer_Free(tok);
1724 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001725}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001726
Guido van Rossum408027e1996-12-30 16:17:54 +00001727#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001728
1729void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001730tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001731{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 printf("%s", _PyParser_TokenNames[type]);
1733 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1734 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001735}
1736
1737#endif