blob: 90b1b689b5452c667f4a2249793f39f54cfe1afc [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C (an int byte value) may begin an identifier: an ASCII letter,
   an underscore, or any value >= 128.
   The argument is parenthesized so that expression arguments expand
   correctly; it is still evaluated more than once, so do not pass
   expressions with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C (an int byte value) may appear inside an identifier: an ASCII
   letter or digit, an underscore, or any value >= 128.
   The argument is parenthesized so that expression arguments expand
   correctly; it is still evaluated more than once, so do not pass
   expressions with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable token names.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX",
    /* 33 */ "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
    /* 43 */ "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL",
    /* 46 */ "DOUBLESTAREQUAL", "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
131 tok->filename = NULL;
132 tok->altwarning = 1;
133 tok->alterror = 1;
134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
136 tok->decoding_state = STATE_INIT;
137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
139 tok->enc = NULL;
140 tok->encoding = NULL;
141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172}
173
/* PGEN build: no decoding is done; return a PyMem-allocated copy of STR
   (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 PyMem_FREE(tok->buf);
188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of UTF-8 and Latin-1 encoding names to a
   canonical name.

   Only the first 12 characters of S are examined, case-insensitively and
   with '_' treated as '-'.  Returns the string constant "utf-8" or
   "iso-8859-1" when S names (or starts with the hyphenated prefix of) one
   of those encodings; otherwise returns S itself, unchanged.  Callers can
   therefore detect "no normalization" by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_ssize_t i;
228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 begin = t;
248 while (Py_ISALNUM(t[0]) ||
249 t[0] == '-' || t[0] == '_' || t[0] == '.')
250 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
256 PyMem_FREE(r);
257 r = new_string(q, strlen(q));
258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000273 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 char * cs;
276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
281 cs = get_coding_spec(line, size);
282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
285 assert(tok->decoding_state == STATE_RAW);
286 if (strcmp(cs, "utf-8") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
292 tok->decoding_state = STATE_NORMAL;
293 }
294 else
295 PyMem_FREE(cs);
296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
299 PyMem_FREE(cs);
300 }
301 }
302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
308 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
323 tok->decoding_state = STATE_RAW;
324 if (ch1 == EOF) {
325 return 1;
326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
352 tok->decoding_state = STATE_NORMAL;
353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 } else {
365 unget_char(ch1, tok);
366 return 1;
367 }
368 if (tok->encoding != NULL)
369 PyMem_FREE(tok->encoding);
370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
371 /* No need to set_readline: input is already utf-8 */
372 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
386 until the buffer ends with a '\n' (or until the end of the file is
387 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 PyObject* bufobj;
394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
409 goto error;
410 }
411 if (PyUnicode_CheckExact(bufobj))
412 {
413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 if (buf == NULL) {
415 goto error;
416 }
417 }
418 else
419 {
420 buf = PyByteArray_AsString(bufobj);
421 if (buf == NULL) {
422 goto error;
423 }
424 buflen = PyByteArray_GET_SIZE(bufobj);
425 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 Py_XDECREF(tok->decoding_buffer);
428 if (buflen > size) {
429 /* Too many chars, the rest goes into tok->decoding_buffer */
430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 buflen-size);
432 if (tok->decoding_buffer == NULL)
433 goto error;
434 buflen = size;
435 }
436 else
437 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
441 if (buflen == 0) /* EOF */
442 s = NULL;
443 Py_DECREF(bufobj);
444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447 Py_XDECREF(bufobj);
448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000466 io = PyImport_ImportModuleNoBlock("io");
467 if (io == NULL)
468 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000469
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 if (tok->filename)
471 stream = PyObject_CallMethod(io, "open", "ssis",
472 tok->filename, "r", -1, enc);
473 else
474 stream = PyObject_CallMethod(io, "open", "isisOOO",
475 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
476 if (stream == NULL)
477 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 Py_XDECREF(tok->decoding_readline);
480 readline = PyObject_GetAttrString(stream, "readline");
481 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000482
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000483 /* The file has been reopened; parsing will restart from
484 * the beginning of the file, we have to reset the line number.
485 * But this function has been called from inside tok_nextc() which
486 * will increment lineno before it returns. So we set it -1 so that
487 * the next call to tok_nextc() will start with tok->lineno == 0.
488 */
489 tok->lineno = -1;
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000490
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000491 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492 Py_XDECREF(stream);
493 Py_XDECREF(io);
494 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495}
496
497/* Fetch the next byte from TOK. */
498
499static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000500 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000501}
502
503/* Unfetch the last byte back into TOK. */
504
505static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507}
508
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined in forward
   order, so the scan stops at the terminator and never reads past the
   end of a string that ends in a truncated sequence. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
536
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537/* Read a line of input from TOK. Determine encoding
538 if necessary. */
539
540static char *
541decoding_fgets(char *s, int size, struct tok_state *tok)
542{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000543 char *line = NULL;
544 int badchar = 0;
545 for (;;) {
546 if (tok->decoding_state == STATE_NORMAL) {
547 /* We already have a codec associated with
548 this input. */
549 line = fp_readl(s, size, tok);
550 break;
551 } else if (tok->decoding_state == STATE_RAW) {
552 /* We want a 'raw' read. */
553 line = Py_UniversalNewlineFgets(s, size,
554 tok->fp, NULL);
555 break;
556 } else {
557 /* We have not yet determined the encoding.
558 If an encoding is found, use the file-pointer
559 reader functions from now on. */
560 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
561 return error_ret(tok);
562 assert(tok->decoding_state != STATE_INIT);
563 }
564 }
565 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
566 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
567 return error_ret(tok);
568 }
569 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000570#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 /* The default encoding is UTF-8, so make sure we don't have any
572 non-UTF-8 sequences in it. */
573 if (line && !tok->encoding) {
574 unsigned char *c;
575 int length;
576 for (c = (unsigned char *)line; *c; c += length)
577 if (!(length = valid_utf8(c))) {
578 badchar = *c;
579 break;
580 }
581 }
582 if (badchar) {
583 /* Need to add 1 to the line number, since this line
584 has not been counted, yet. */
585 PyErr_Format(PyExc_SyntaxError,
586 "Non-UTF-8 code starting with '\\x%.2x' "
587 "in file %.200s on line %i, "
588 "but no encoding declared; "
589 "see http://python.org/dev/peps/pep-0263/ for details",
590 badchar, tok->filename, tok->lineno + 1);
591 return error_ret(tok);
592 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000594 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595}
596
597static int
598decoding_feof(struct tok_state *tok)
599{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000600 if (tok->decoding_state != STATE_NORMAL) {
601 return feof(tok->fp);
602 } else {
603 PyObject* buf = tok->decoding_buffer;
604 if (buf == NULL) {
605 buf = PyObject_CallObject(tok->decoding_readline, NULL);
606 if (buf == NULL) {
607 error_ret(tok);
608 return 1;
609 } else {
610 tok->decoding_buffer = buf;
611 }
612 }
613 return PyObject_Length(buf) == 0;
614 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615}
616
617/* Fetch a byte from TOK, using the string buffer. */
618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000619static int
620buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622}
623
624/* Unfetch a byte from TOK, using the string buffer. */
625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000626static void
627buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 tok->str--;
629 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630}
631
632/* Set the readline function for TOK to ENC. For the string-based
633 tokenizer, this means to just record the encoding. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 tok->enc = enc;
638 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639}
640
641/* Return a UTF-8 encoding Python string object from the
642 C byte string STR, which is encoded with ENC. */
643
644static PyObject *
645translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 PyObject *utf8;
647 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
648 if (buf == NULL)
649 return NULL;
650 utf8 = PyUnicode_AsUTF8String(buf);
651 Py_DECREF(buf);
652 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653}
654
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000655
656static char *
657translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
659 char *buf, *current;
660 char c = '\0';
661 buf = PyMem_MALLOC(needed_length);
662 if (buf == NULL) {
663 tok->done = E_NOMEM;
664 return NULL;
665 }
666 for (current = buf; *s; s++, current++) {
667 c = *s;
668 if (skip_next_lf) {
669 skip_next_lf = 0;
670 if (c == '\n') {
671 c = *++s;
672 if (!c)
673 break;
674 }
675 }
676 if (c == '\r') {
677 skip_next_lf = 1;
678 c = '\n';
679 }
680 *current = c;
681 }
682 /* If this is exec input, add a newline to the end of the string if
683 there isn't one already. */
684 if (exec_input && c != '\n') {
685 *current = '\n';
686 current++;
687 }
688 *current = '\0';
689 final_length = current - buf + 1;
690 if (final_length < needed_length && final_length)
691 /* should never fail */
692 buf = PyMem_REALLOC(buf, final_length);
693 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000694}
695
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000696/* Decode a byte string STR for use as the buffer of TOK.
697 Look for encoding declarations inside STR, and record them
698 inside TOK. */
699
700static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000701decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000703 PyObject* utf8 = NULL;
704 const char *str;
705 const char *s;
706 const char *newl[2] = {NULL, NULL};
707 int lineno = 0;
708 tok->input = str = translate_newlines(input, single, tok);
709 if (str == NULL)
710 return NULL;
711 tok->enc = NULL;
712 tok->str = str;
713 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
714 return error_ret(tok);
715 str = tok->str; /* string after BOM if any */
716 assert(str);
717 if (tok->enc != NULL) {
718 utf8 = translate_into_utf8(str, tok->enc);
719 if (utf8 == NULL)
720 return error_ret(tok);
721 str = PyBytes_AsString(utf8);
722 }
723 for (s = str;; s++) {
724 if (*s == '\0') break;
725 else if (*s == '\n') {
726 assert(lineno < 2);
727 newl[lineno] = s;
728 lineno++;
729 if (lineno == 2) break;
730 }
731 }
732 tok->enc = NULL;
733 /* need to check line 1 and 2 separately since check_coding_spec
734 assumes a single line as input */
735 if (newl[0]) {
736 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
737 return error_ret(tok);
738 if (tok->enc == NULL && newl[1]) {
739 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
740 tok, buf_setreadl))
741 return error_ret(tok);
742 }
743 }
744 if (tok->enc != NULL) {
745 assert(utf8 == NULL);
746 utf8 = translate_into_utf8(str, tok->enc);
747 if (utf8 == NULL)
748 return error_ret(tok);
749 str = PyBytes_AS_STRING(utf8);
750 }
751 assert(tok->decoding_buffer == NULL);
752 tok->decoding_buffer = utf8; /* CAUTION */
753 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000754}
755
756#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757
758/* Set up tokenizer for string */
759
760struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000761PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 struct tok_state *tok = tok_new();
764 if (tok == NULL)
765 return NULL;
766 str = (char *)decode_str(str, exec_input, tok);
767 if (str == NULL) {
768 PyTokenizer_Free(tok);
769 return NULL;
770 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000771
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 /* XXX: constify members. */
773 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
774 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000775}
776
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000777struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000778PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000779{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 struct tok_state *tok = tok_new();
781 if (tok == NULL)
782 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000783#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000785#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 if (str == NULL) {
787 PyTokenizer_Free(tok);
788 return NULL;
789 }
790 tok->decoding_state = STATE_RAW;
791 tok->read_coding_spec = 1;
792 tok->enc = NULL;
793 tok->str = str;
794 tok->encoding = (char *)PyMem_MALLOC(6);
795 if (!tok->encoding) {
796 PyTokenizer_Free(tok);
797 return NULL;
798 }
799 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000800
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 /* XXX: constify members. */
802 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
803 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000804}
805
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000806/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807
808struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000809PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 struct tok_state *tok = tok_new();
812 if (tok == NULL)
813 return NULL;
814 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
815 PyTokenizer_Free(tok);
816 return NULL;
817 }
818 tok->cur = tok->inp = tok->buf;
819 tok->end = tok->buf + BUFSIZ;
820 tok->fp = fp;
821 tok->prompt = ps1;
822 tok->nextprompt = ps2;
823 if (enc != NULL) {
824 /* Must copy encoding declaration since it
825 gets copied into the parse tree. */
826 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
827 if (!tok->encoding) {
828 PyTokenizer_Free(tok);
829 return NULL;
830 }
831 strcpy(tok->encoding, enc);
832 tok->decoding_state = STATE_NORMAL;
833 }
834 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835}
836
837
838/* Free a tok_state structure */
839
840void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000841PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 if (tok->encoding != NULL)
844 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000845#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 Py_XDECREF(tok->decoding_readline);
847 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000848#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 if (tok->fp != NULL && tok->buf != NULL)
850 PyMem_FREE(tok->buf);
851 if (tok->input)
852 PyMem_FREE((char *)tok->input);
853 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854}
855
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856/* Get next char, updating state; error code goes into tok->done */
857
858static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000859tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 for (;;) {
862 if (tok->cur != tok->inp) {
863 return Py_CHARMASK(*tok->cur++); /* Fast path */
864 }
865 if (tok->done != E_OK)
866 return EOF;
867 if (tok->fp == NULL) {
868 char *end = strchr(tok->inp, '\n');
869 if (end != NULL)
870 end++;
871 else {
872 end = strchr(tok->inp, '\0');
873 if (end == tok->inp) {
874 tok->done = E_EOF;
875 return EOF;
876 }
877 }
878 if (tok->start == NULL)
879 tok->buf = tok->cur;
880 tok->line_start = tok->cur;
881 tok->lineno++;
882 tok->inp = end;
883 return Py_CHARMASK(*tok->cur++);
884 }
885 if (tok->prompt != NULL) {
886 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000887#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 if (tok->encoding && newtok && *newtok) {
889 /* Recode to UTF-8 */
890 Py_ssize_t buflen;
891 const char* buf;
892 PyObject *u = translate_into_utf8(newtok, tok->encoding);
893 PyMem_FREE(newtok);
894 if (!u) {
895 tok->done = E_DECODE;
896 return EOF;
897 }
898 buflen = PyBytes_GET_SIZE(u);
899 buf = PyBytes_AS_STRING(u);
900 if (!buf) {
901 Py_DECREF(u);
902 tok->done = E_DECODE;
903 return EOF;
904 }
905 newtok = PyMem_MALLOC(buflen+1);
906 strcpy(newtok, buf);
907 Py_DECREF(u);
908 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000909#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910 if (tok->nextprompt != NULL)
911 tok->prompt = tok->nextprompt;
912 if (newtok == NULL)
913 tok->done = E_INTR;
914 else if (*newtok == '\0') {
915 PyMem_FREE(newtok);
916 tok->done = E_EOF;
917 }
918 else if (tok->start != NULL) {
919 size_t start = tok->start - tok->buf;
920 size_t oldlen = tok->cur - tok->buf;
921 size_t newlen = oldlen + strlen(newtok);
922 char *buf = tok->buf;
923 buf = (char *)PyMem_REALLOC(buf, newlen+1);
924 tok->lineno++;
925 if (buf == NULL) {
926 PyMem_FREE(tok->buf);
927 tok->buf = NULL;
928 PyMem_FREE(newtok);
929 tok->done = E_NOMEM;
930 return EOF;
931 }
932 tok->buf = buf;
933 tok->cur = tok->buf + oldlen;
934 tok->line_start = tok->cur;
935 strcpy(tok->buf + oldlen, newtok);
936 PyMem_FREE(newtok);
937 tok->inp = tok->buf + newlen;
938 tok->end = tok->inp + 1;
939 tok->start = tok->buf + start;
940 }
941 else {
942 tok->lineno++;
943 if (tok->buf != NULL)
944 PyMem_FREE(tok->buf);
945 tok->buf = newtok;
946 tok->line_start = tok->buf;
947 tok->cur = tok->buf;
948 tok->line_start = tok->buf;
949 tok->inp = strchr(tok->buf, '\0');
950 tok->end = tok->inp + 1;
951 }
952 }
953 else {
954 int done = 0;
955 Py_ssize_t cur = 0;
956 char *pt;
957 if (tok->start == NULL) {
958 if (tok->buf == NULL) {
959 tok->buf = (char *)
960 PyMem_MALLOC(BUFSIZ);
961 if (tok->buf == NULL) {
962 tok->done = E_NOMEM;
963 return EOF;
964 }
965 tok->end = tok->buf + BUFSIZ;
966 }
967 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
968 tok) == NULL) {
969 tok->done = E_EOF;
970 done = 1;
971 }
972 else {
973 tok->done = E_OK;
974 tok->inp = strchr(tok->buf, '\0');
975 done = tok->inp[-1] == '\n';
976 }
977 }
978 else {
979 cur = tok->cur - tok->buf;
980 if (decoding_feof(tok)) {
981 tok->done = E_EOF;
982 done = 1;
983 }
984 else
985 tok->done = E_OK;
986 }
987 tok->lineno++;
988 /* Read until '\n' or EOF */
989 while (!done) {
990 Py_ssize_t curstart = tok->start == NULL ? -1 :
991 tok->start - tok->buf;
992 Py_ssize_t curvalid = tok->inp - tok->buf;
993 Py_ssize_t newsize = curvalid + BUFSIZ;
994 char *newbuf = tok->buf;
995 newbuf = (char *)PyMem_REALLOC(newbuf,
996 newsize);
997 if (newbuf == NULL) {
998 tok->done = E_NOMEM;
999 tok->cur = tok->inp;
1000 return EOF;
1001 }
1002 tok->buf = newbuf;
1003 tok->inp = tok->buf + curvalid;
1004 tok->end = tok->buf + newsize;
1005 tok->start = curstart < 0 ? NULL :
1006 tok->buf + curstart;
1007 if (decoding_fgets(tok->inp,
1008 (int)(tok->end - tok->inp),
1009 tok) == NULL) {
1010 /* Break out early on decoding
1011 errors, as tok->buf will be NULL
1012 */
1013 if (tok->decoding_erred)
1014 return EOF;
1015 /* Last line does not end in \n,
1016 fake one */
1017 strcpy(tok->inp, "\n");
1018 }
1019 tok->inp = strchr(tok->inp, '\0');
1020 done = tok->inp[-1] == '\n';
1021 }
1022 if (tok->buf != NULL) {
1023 tok->cur = tok->buf + cur;
1024 tok->line_start = tok->cur;
1025 /* replace "\r\n" with "\n" */
1026 /* For Mac leave the \r, giving a syntax error */
1027 pt = tok->inp - 2;
1028 if (pt >= tok->buf && *pt == '\r') {
1029 *pt++ = '\n';
1030 *pt = '\0';
1031 tok->inp = pt;
1032 }
1033 }
1034 }
1035 if (tok->done != E_OK) {
1036 if (tok->prompt != NULL)
1037 PySys_WriteStderr("\n");
1038 tok->cur = tok->inp;
1039 return EOF;
1040 }
1041 }
1042 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001043}
1044
1045
1046/* Back-up one character */
1047
1048static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001049tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001050{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 if (c != EOF) {
1052 if (--tok->cur < tok->buf)
1053 Py_FatalError("tok_backup: beginning of buffer");
1054 if (*tok->cur != c)
1055 *tok->cur = c;
1056 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001057}
1058
1059
1060/* Return the token corresponding to a single character */
1061
1062int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001063PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 switch (c) {
1066 case '(': return LPAR;
1067 case ')': return RPAR;
1068 case '[': return LSQB;
1069 case ']': return RSQB;
1070 case ':': return COLON;
1071 case ',': return COMMA;
1072 case ';': return SEMI;
1073 case '+': return PLUS;
1074 case '-': return MINUS;
1075 case '*': return STAR;
1076 case '/': return SLASH;
1077 case '|': return VBAR;
1078 case '&': return AMPER;
1079 case '<': return LESS;
1080 case '>': return GREATER;
1081 case '=': return EQUAL;
1082 case '.': return DOT;
1083 case '%': return PERCENT;
1084 case '{': return LBRACE;
1085 case '}': return RBRACE;
1086 case '^': return CIRCUMFLEX;
1087 case '~': return TILDE;
1088 case '@': return AT;
1089 default: return OP;
1090 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091}
1092
1093
Guido van Rossumfbab9051991-10-20 20:25:03 +00001094int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001095PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001096{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 switch (c1) {
1098 case '=':
1099 switch (c2) {
1100 case '=': return EQEQUAL;
1101 }
1102 break;
1103 case '!':
1104 switch (c2) {
1105 case '=': return NOTEQUAL;
1106 }
1107 break;
1108 case '<':
1109 switch (c2) {
1110 case '>': return NOTEQUAL;
1111 case '=': return LESSEQUAL;
1112 case '<': return LEFTSHIFT;
1113 }
1114 break;
1115 case '>':
1116 switch (c2) {
1117 case '=': return GREATEREQUAL;
1118 case '>': return RIGHTSHIFT;
1119 }
1120 break;
1121 case '+':
1122 switch (c2) {
1123 case '=': return PLUSEQUAL;
1124 }
1125 break;
1126 case '-':
1127 switch (c2) {
1128 case '=': return MINEQUAL;
1129 case '>': return RARROW;
1130 }
1131 break;
1132 case '*':
1133 switch (c2) {
1134 case '*': return DOUBLESTAR;
1135 case '=': return STAREQUAL;
1136 }
1137 break;
1138 case '/':
1139 switch (c2) {
1140 case '/': return DOUBLESLASH;
1141 case '=': return SLASHEQUAL;
1142 }
1143 break;
1144 case '|':
1145 switch (c2) {
1146 case '=': return VBAREQUAL;
1147 }
1148 break;
1149 case '%':
1150 switch (c2) {
1151 case '=': return PERCENTEQUAL;
1152 }
1153 break;
1154 case '&':
1155 switch (c2) {
1156 case '=': return AMPEREQUAL;
1157 }
1158 break;
1159 case '^':
1160 switch (c2) {
1161 case '=': return CIRCUMFLEXEQUAL;
1162 }
1163 break;
1164 }
1165 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001166}
1167
Thomas Wouters434d0822000-08-24 20:11:32 +00001168int
1169PyToken_ThreeChars(int c1, int c2, int c3)
1170{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 switch (c1) {
1172 case '<':
1173 switch (c2) {
1174 case '<':
1175 switch (c3) {
1176 case '=':
1177 return LEFTSHIFTEQUAL;
1178 }
1179 break;
1180 }
1181 break;
1182 case '>':
1183 switch (c2) {
1184 case '>':
1185 switch (c3) {
1186 case '=':
1187 return RIGHTSHIFTEQUAL;
1188 }
1189 break;
1190 }
1191 break;
1192 case '*':
1193 switch (c2) {
1194 case '*':
1195 switch (c3) {
1196 case '=':
1197 return DOUBLESTAREQUAL;
1198 }
1199 break;
1200 }
1201 break;
1202 case '/':
1203 switch (c2) {
1204 case '/':
1205 switch (c3) {
1206 case '=':
1207 return DOUBLESLASHEQUAL;
1208 }
1209 break;
1210 }
1211 break;
1212 case '.':
1213 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001214 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 switch (c3) {
1216 case '.':
1217 return ELLIPSIS;
1218 }
1219 break;
1220 }
1221 break;
1222 }
1223 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001224}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001225
Guido van Rossum926f13a1998-04-09 21:38:06 +00001226static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001227indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 if (tok->alterror) {
1230 tok->done = E_TABSPACE;
1231 tok->cur = tok->inp;
1232 return 1;
1233 }
1234 if (tok->altwarning) {
1235 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1236 "in indentation\n", tok->filename);
1237 tok->altwarning = 0;
1238 }
1239 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001240}
1241
Martin v. Löwis47383402007-08-15 07:32:56 +00001242#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001243#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001244#else
1245/* Verify that the identifier follows PEP 3131. */
1246static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001247verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001248{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 PyObject *s;
1250 int result;
1251 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1252 if (s == NULL) {
1253 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1254 PyErr_Clear();
1255 tok->done = E_IDENTIFIER;
1256 } else {
1257 tok->done = E_ERROR;
1258 }
1259 return 0;
1260 }
1261 result = PyUnicode_IsIdentifier(s);
1262 Py_DECREF(s);
1263 if (result == 0)
1264 tok->done = E_IDENTIFIER;
1265 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001266}
1267#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001268
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269/* Get next token, after space stripping etc. */
1270
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001271static int
1272tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 register int c;
1275 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001276
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001278 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 tok->start = NULL;
1280 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001281
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 /* Get indentation level */
1283 if (tok->atbol) {
1284 register int col = 0;
1285 register int altcol = 0;
1286 tok->atbol = 0;
1287 for (;;) {
1288 c = tok_nextc(tok);
1289 if (c == ' ')
1290 col++, altcol++;
1291 else if (c == '\t') {
1292 col = (col/tok->tabsize + 1) * tok->tabsize;
1293 altcol = (altcol/tok->alttabsize + 1)
1294 * tok->alttabsize;
1295 }
1296 else if (c == '\014') /* Control-L (formfeed) */
1297 col = altcol = 0; /* For Emacs users */
1298 else
1299 break;
1300 }
1301 tok_backup(tok, c);
1302 if (c == '#' || c == '\n') {
1303 /* Lines with only whitespace and/or comments
1304 shouldn't affect the indentation and are
1305 not passed to the parser as NEWLINE tokens,
1306 except *totally* empty lines in interactive
1307 mode, which signal the end of a command group. */
1308 if (col == 0 && c == '\n' && tok->prompt != NULL)
1309 blankline = 0; /* Let it through */
1310 else
1311 blankline = 1; /* Ignore completely */
1312 /* We can't jump back right here since we still
1313 may need to skip to the end of a comment */
1314 }
1315 if (!blankline && tok->level == 0) {
1316 if (col == tok->indstack[tok->indent]) {
1317 /* No change */
1318 if (altcol != tok->altindstack[tok->indent]) {
1319 if (indenterror(tok))
1320 return ERRORTOKEN;
1321 }
1322 }
1323 else if (col > tok->indstack[tok->indent]) {
1324 /* Indent -- always one */
1325 if (tok->indent+1 >= MAXINDENT) {
1326 tok->done = E_TOODEEP;
1327 tok->cur = tok->inp;
1328 return ERRORTOKEN;
1329 }
1330 if (altcol <= tok->altindstack[tok->indent]) {
1331 if (indenterror(tok))
1332 return ERRORTOKEN;
1333 }
1334 tok->pendin++;
1335 tok->indstack[++tok->indent] = col;
1336 tok->altindstack[tok->indent] = altcol;
1337 }
1338 else /* col < tok->indstack[tok->indent] */ {
1339 /* Dedent -- any number, must be consistent */
1340 while (tok->indent > 0 &&
1341 col < tok->indstack[tok->indent]) {
1342 tok->pendin--;
1343 tok->indent--;
1344 }
1345 if (col != tok->indstack[tok->indent]) {
1346 tok->done = E_DEDENT;
1347 tok->cur = tok->inp;
1348 return ERRORTOKEN;
1349 }
1350 if (altcol != tok->altindstack[tok->indent]) {
1351 if (indenterror(tok))
1352 return ERRORTOKEN;
1353 }
1354 }
1355 }
1356 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001357
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001358 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001359
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 /* Return pending indents/dedents */
1361 if (tok->pendin != 0) {
1362 if (tok->pendin < 0) {
1363 tok->pendin++;
1364 return DEDENT;
1365 }
1366 else {
1367 tok->pendin--;
1368 return INDENT;
1369 }
1370 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001371
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 tok->start = NULL;
1374 /* Skip spaces */
1375 do {
1376 c = tok_nextc(tok);
1377 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 /* Set start of current token */
1380 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 /* Skip comment */
1383 if (c == '#')
1384 while (c != EOF && c != '\n')
1385 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001386
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 /* Check for EOF and errors now */
1388 if (c == EOF) {
1389 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1390 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001391
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 /* Identifier (most frequent token!) */
1393 nonascii = 0;
1394 if (is_potential_identifier_start(c)) {
1395 /* Process b"", r"" and br"" */
1396 if (c == 'b' || c == 'B') {
1397 c = tok_nextc(tok);
1398 if (c == '"' || c == '\'')
1399 goto letter_quote;
1400 }
1401 if (c == 'r' || c == 'R') {
1402 c = tok_nextc(tok);
1403 if (c == '"' || c == '\'')
1404 goto letter_quote;
1405 }
1406 while (is_potential_identifier_char(c)) {
1407 if (c >= 128)
1408 nonascii = 1;
1409 c = tok_nextc(tok);
1410 }
1411 tok_backup(tok, c);
1412 if (nonascii &&
1413 !verify_identifier(tok)) {
1414 tok->done = E_IDENTIFIER;
1415 return ERRORTOKEN;
1416 }
1417 *p_start = tok->start;
1418 *p_end = tok->cur;
1419 return NAME;
1420 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 /* Newline */
1423 if (c == '\n') {
1424 tok->atbol = 1;
1425 if (blankline || tok->level > 0)
1426 goto nextline;
1427 *p_start = tok->start;
1428 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1429 tok->cont_line = 0;
1430 return NEWLINE;
1431 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 /* Period or number starting with period? */
1434 if (c == '.') {
1435 c = tok_nextc(tok);
1436 if (isdigit(c)) {
1437 goto fraction;
1438 } else if (c == '.') {
1439 c = tok_nextc(tok);
1440 if (c == '.') {
1441 *p_start = tok->start;
1442 *p_end = tok->cur;
1443 return ELLIPSIS;
1444 } else {
1445 tok_backup(tok, c);
1446 }
1447 tok_backup(tok, '.');
1448 } else {
1449 tok_backup(tok, c);
1450 }
1451 *p_start = tok->start;
1452 *p_end = tok->cur;
1453 return DOT;
1454 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001455
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 /* Number */
1457 if (isdigit(c)) {
1458 if (c == '0') {
1459 /* Hex, octal or binary -- maybe. */
1460 c = tok_nextc(tok);
1461 if (c == '.')
1462 goto fraction;
1463 if (c == 'j' || c == 'J')
1464 goto imaginary;
1465 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001466
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001467 /* Hex */
1468 c = tok_nextc(tok);
1469 if (!isxdigit(c)) {
1470 tok->done = E_TOKEN;
1471 tok_backup(tok, c);
1472 return ERRORTOKEN;
1473 }
1474 do {
1475 c = tok_nextc(tok);
1476 } while (isxdigit(c));
1477 }
1478 else if (c == 'o' || c == 'O') {
1479 /* Octal */
1480 c = tok_nextc(tok);
1481 if (c < '0' || c >= '8') {
1482 tok->done = E_TOKEN;
1483 tok_backup(tok, c);
1484 return ERRORTOKEN;
1485 }
1486 do {
1487 c = tok_nextc(tok);
1488 } while ('0' <= c && c < '8');
1489 }
1490 else if (c == 'b' || c == 'B') {
1491 /* Binary */
1492 c = tok_nextc(tok);
1493 if (c != '0' && c != '1') {
1494 tok->done = E_TOKEN;
1495 tok_backup(tok, c);
1496 return ERRORTOKEN;
1497 }
1498 do {
1499 c = tok_nextc(tok);
1500 } while (c == '0' || c == '1');
1501 }
1502 else {
1503 int nonzero = 0;
1504 /* maybe old-style octal; c is first char of it */
1505 /* in any case, allow '0' as a literal */
1506 while (c == '0')
1507 c = tok_nextc(tok);
1508 while (isdigit(c)) {
1509 nonzero = 1;
1510 c = tok_nextc(tok);
1511 }
1512 if (c == '.')
1513 goto fraction;
1514 else if (c == 'e' || c == 'E')
1515 goto exponent;
1516 else if (c == 'j' || c == 'J')
1517 goto imaginary;
1518 else if (nonzero) {
1519 tok->done = E_TOKEN;
1520 tok_backup(tok, c);
1521 return ERRORTOKEN;
1522 }
1523 }
1524 }
1525 else {
1526 /* Decimal */
1527 do {
1528 c = tok_nextc(tok);
1529 } while (isdigit(c));
1530 {
1531 /* Accept floating point numbers. */
1532 if (c == '.') {
1533 fraction:
1534 /* Fraction */
1535 do {
1536 c = tok_nextc(tok);
1537 } while (isdigit(c));
1538 }
1539 if (c == 'e' || c == 'E') {
1540 exponent:
1541 /* Exponent part */
1542 c = tok_nextc(tok);
1543 if (c == '+' || c == '-')
1544 c = tok_nextc(tok);
1545 if (!isdigit(c)) {
1546 tok->done = E_TOKEN;
1547 tok_backup(tok, c);
1548 return ERRORTOKEN;
1549 }
1550 do {
1551 c = tok_nextc(tok);
1552 } while (isdigit(c));
1553 }
1554 if (c == 'j' || c == 'J')
1555 /* Imaginary part */
1556 imaginary:
1557 c = tok_nextc(tok);
1558 }
1559 }
1560 tok_backup(tok, c);
1561 *p_start = tok->start;
1562 *p_end = tok->cur;
1563 return NUMBER;
1564 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001565
1566 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 /* String */
1568 if (c == '\'' || c == '"') {
1569 int quote = c;
1570 int quote_size = 1; /* 1 or 3 */
1571 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001572
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001573 /* Find the quote size and start of string */
1574 c = tok_nextc(tok);
1575 if (c == quote) {
1576 c = tok_nextc(tok);
1577 if (c == quote)
1578 quote_size = 3;
1579 else
1580 end_quote_size = 1; /* empty string found */
1581 }
1582 if (c != quote)
1583 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001584
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001585 /* Get rest of string */
1586 while (end_quote_size != quote_size) {
1587 c = tok_nextc(tok);
1588 if (c == EOF) {
1589 if (quote_size == 3)
1590 tok->done = E_EOFS;
1591 else
1592 tok->done = E_EOLS;
1593 tok->cur = tok->inp;
1594 return ERRORTOKEN;
1595 }
1596 if (quote_size == 1 && c == '\n') {
1597 tok->done = E_EOLS;
1598 tok->cur = tok->inp;
1599 return ERRORTOKEN;
1600 }
1601 if (c == quote)
1602 end_quote_size += 1;
1603 else {
1604 end_quote_size = 0;
1605 if (c == '\\')
1606 c = tok_nextc(tok); /* skip escaped char */
1607 }
1608 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001609
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 *p_start = tok->start;
1611 *p_end = tok->cur;
1612 return STRING;
1613 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001614
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001615 /* Line continuation */
1616 if (c == '\\') {
1617 c = tok_nextc(tok);
1618 if (c != '\n') {
1619 tok->done = E_LINECONT;
1620 tok->cur = tok->inp;
1621 return ERRORTOKEN;
1622 }
1623 tok->cont_line = 1;
1624 goto again; /* Read next line */
1625 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 /* Check for two-character token */
1628 {
1629 int c2 = tok_nextc(tok);
1630 int token = PyToken_TwoChars(c, c2);
1631 if (token != OP) {
1632 int c3 = tok_nextc(tok);
1633 int token3 = PyToken_ThreeChars(c, c2, c3);
1634 if (token3 != OP) {
1635 token = token3;
1636 } else {
1637 tok_backup(tok, c3);
1638 }
1639 *p_start = tok->start;
1640 *p_end = tok->cur;
1641 return token;
1642 }
1643 tok_backup(tok, c2);
1644 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001645
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 /* Keep track of parentheses nesting level */
1647 switch (c) {
1648 case '(':
1649 case '[':
1650 case '{':
1651 tok->level++;
1652 break;
1653 case ')':
1654 case ']':
1655 case '}':
1656 tok->level--;
1657 break;
1658 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001659
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 /* Punctuation character */
1661 *p_start = tok->start;
1662 *p_end = tok->cur;
1663 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001664}
1665
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001666int
1667PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1668{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 int result = tok_get(tok, p_start, p_end);
1670 if (tok->decoding_erred) {
1671 result = ERRORTOKEN;
1672 tok->done = E_DECODE;
1673 }
1674 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001675}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001676
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001677/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001678
1679 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001680 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001681 should be assumed to be PyUnicode_GetDefaultEncoding()).
1682
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001683 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1684 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001685*/
1686char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001687PyTokenizer_FindEncoding(int fd)
1688{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 struct tok_state *tok;
1690 FILE *fp;
1691 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001692
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 fd = dup(fd);
1694 if (fd < 0) {
1695 return NULL;
1696 }
1697 fp = fdopen(fd, "r");
1698 if (fp == NULL) {
1699 return NULL;
1700 }
1701 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1702 if (tok == NULL) {
1703 fclose(fp);
1704 return NULL;
1705 }
1706 while (tok->lineno < 2 && tok->done == E_OK) {
1707 PyTokenizer_Get(tok, &p_start, &p_end);
1708 }
1709 fclose(fp);
1710 if (tok->encoding) {
1711 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1712 if (encoding)
1713 strcpy(encoding, tok->encoding);
1714 }
1715 PyTokenizer_Free(tok);
1716 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001717}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001718
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by
   the token text between start and end in parentheses for NAME, NUMBER,
   STRING and OP tokens.  `type` is used directly as an index into
   _PyParser_TokenNames; no range check is made. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif