blob: 1c2b8e8e107e4c8f1bfb5cff68d0f19c095191a2 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128 (i.e. a non-ASCII byte).
   The argument is fully parenthesized so that expression arguments expand
   correctly; it is still evaluated several times, so it must be free of
   side effects. */
#define is_potential_identifier_start(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || (c) == '_'\
    || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128 (i.e. a non-ASCII byte).
   The argument is fully parenthesized so that expression arguments expand
   correctly; it is still evaluated several times, so it must be free of
   side effects. */
#define is_potential_identifier_char(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || ((c) >= '0' && (c) <= '9')\
    || (c) == '_'\
    || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative int.
   When plain char is already unsigned the value is passed through as is;
   otherwise the low eight bits are kept. */
/* XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
Brett Cannond5ec98c2007-10-20 02:54:14 +000055
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000056/* Token names */
57
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
116
117
118/* Create and initialize a new tok_state structure */
119
120static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000123 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
124 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 if (tok == NULL)
126 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000127 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 tok->done = E_OK;
129 tok->fp = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000137 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000143 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000149 tok->decoding_readline = NULL;
150 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000151#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000152 return tok;
153}
154
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000155#ifdef PGEN
156
157static char *
158decoding_fgets(char *s, int size, struct tok_state *tok)
159{
160 return fgets(s, size, tok->fp);
161}
162
163static int
164decoding_feof(struct tok_state *tok)
165{
166 return feof(tok->fp);
167}
168
/* PGEN variant: the input string is returned unchanged; TOK is not used. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
174
175#else /* PGEN */
176
177static char *
178error_ret(struct tok_state *tok) /* XXX */
179{
180 tok->decoding_erred = 1;
181 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 tok->buf = NULL;
184 return NULL; /* as if it were EOF */
185}
186
187static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000190 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000191 if (result != NULL) {
192 memcpy(result, s, len);
193 result[len] = '\0';
194 }
195 return result;
196}
197
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased, with '_'
   treated as '-'.  Returns the string literal "utf-8" or "iso-8859-1" when
   the name (or a "<name>-" prefix of it) is recognized; otherwise returns S
   itself.  Callers compare the result against S to tell which case applied. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* tolower() is only defined for unsigned char values (and EOF),
           so never hand it a possibly negative plain char. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
220
221/* Return the coding spec in S, or NULL if none is found. */
222
223static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000224get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000225{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000226 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000227 /* Coding spec must be in a comment, and that comment must be
228 * the only statement on the source code line. */
229 for (i = 0; i < size - 6; i++) {
230 if (s[i] == '#')
231 break;
232 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
233 return NULL;
234 }
235 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236 const char* t = s + i;
237 if (strncmp(t, "coding", 6) == 0) {
238 const char* begin = NULL;
239 t += 6;
240 if (t[0] != ':' && t[0] != '=')
241 continue;
242 do {
243 t++;
244 } while (t[0] == '\x20' || t[0] == '\t');
245
246 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000247 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000248 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 t++;
250
251 if (begin < t) {
252 char* r = new_string(begin, t - begin);
253 char* q = get_normal_name(r);
254 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000256 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 }
258 return r;
259 }
260 }
261 }
262 return NULL;
263}
264
265/* Check whether the line contains a coding spec. If it does,
266 invoke the set_readline function for the new encoding.
267 This function receives the tok_state and the new encoding.
268 Return 1 on success, 0 on failure. */
269
270static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000271check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int set_readline(struct tok_state *, const char *))
273{
Tim Peters17db21f2002-09-03 15:39:58 +0000274 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000276
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000277 if (tok->cont_line)
278 /* It's a continuation line, so it can't be a coding spec. */
279 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000280 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 if (cs != NULL) {
282 tok->read_coding_spec = 1;
283 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 if (strcmp(cs, "utf-8") == 0 ||
286 strcmp(cs, "iso-8859-1") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
326 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
327 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
328#if 0
329 /* Disable support for UTF-16 BOMs until a decision
330 is made whether this needs to be supported. */
331 } else if (ch == 0xFE) {
332 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
333 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000334 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335 } else if (ch == 0xFF) {
336 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
337 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000338 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339#endif
340 } else {
341 unget_char(ch, tok);
342 return 1;
343 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000344 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000345 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000346 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
347 return 1;
348 NON_BOM:
349 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
350 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
351 return 1;
352}
353
354/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000357 On entry, tok->decoding_buffer will be one of:
358 1) NULL: need to call tok->decoding_readline to get a new line
359 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
360 stored the result in tok->decoding_buffer
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000361 3) PyBytesObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000362 (in the s buffer) to copy entire contents of the line read
363 by tok->decoding_readline. tok->decoding_buffer has the overflow.
364 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000366 reached): see tok_nextc and its calls to decoding_fgets.
367*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368
369static char *
370fp_readl(char *s, int size, struct tok_state *tok)
371{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000372 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000373 const char *buf;
374 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000375
376 /* Ask for one less byte so we can terminate it */
377 assert(size > 0);
378 size--;
379
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000380 if (tok->decoding_buffer) {
381 bufobj = tok->decoding_buffer;
382 Py_INCREF(bufobj);
383 }
384 else
385 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000386 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
387 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000388 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000390 if (PyUnicode_CheckExact(bufobj))
391 {
392 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
393 if (buf == NULL) {
394 goto error;
395 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000396 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000397 else
398 {
399 buf = PyBytes_AsString(bufobj);
400 if (buf == NULL) {
401 goto error;
402 }
403 buflen = PyBytes_GET_SIZE(bufobj);
404 }
405
406 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000407 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000408 /* Too many chars, the rest goes into tok->decoding_buffer */
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000409 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
410 buflen-size);
411 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000412 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000413 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000414 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000415 else
416 tok->decoding_buffer = NULL;
417
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000418 memcpy(s, buf, buflen);
419 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000420 if (buflen == 0) /* EOF */
421 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000422 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000423 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000424
425error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000427 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428}
429
430/* Set the readline function for TOK to a StreamReader's
431 readline function. The StreamReader is named ENC.
432
433 This function is called from check_bom and check_coding_spec.
434
435 ENC is usually identical to the future value of tok->encoding,
436 except for the (currently unsupported) case of UTF-16.
437
438 Return 1 on success, 0 on failure. */
439
440static int
441fp_setreadl(struct tok_state *tok, const char* enc)
442{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000443 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000445 io = PyImport_ImportModule("io");
446 if (io == NULL)
447 goto cleanup;
448
449 stream = PyObject_CallMethod(io, "open", "ssis",
450 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000451 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000452 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000454 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000455 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000456 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000457
458 cleanup:
459 Py_XDECREF(stream);
460 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000461 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462}
463
464/* Fetch the next byte from TOK. */
465
466static int fp_getc(struct tok_state *tok) {
467 return getc(tok->fp);
468}
469
470/* Unfetch the last byte back into TOK. */
471
472static void fp_ungetc(int c, struct tok_state *tok) {
473 ungetc(c, tok->fp);
474}
475
/* Check whether the bytes at S start a structurally valid UTF-8 sequence
   (lead byte followed by the right number of 10xxxxxx continuation bytes).
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to back,
   so a sequence cut short by the terminating NUL is rejected at the NUL and
   no byte beyond it is ever read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
503
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504/* Read a line of input from TOK. Determine encoding
505 if necessary. */
506
507static char *
508decoding_fgets(char *s, int size, struct tok_state *tok)
509{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000510 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000511 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000512 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000513 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514 /* We already have a codec associated with
515 this input. */
516 line = fp_readl(s, size, tok);
517 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000518 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000519 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000520 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000522 break;
523 } else {
524 /* We have not yet determined the encoding.
525 If an encoding is found, use the file-pointer
526 reader functions from now on. */
527 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
528 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000529 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000531 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
533 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
534 return error_ret(tok);
535 }
536 }
537#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000538 /* The default encoding is UTF-8, so make sure we don't have any
539 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000540 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000542 int length;
543 for (c = (unsigned char *)line; *c; c += length)
544 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545 badchar = *c;
546 break;
547 }
548 }
549 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000550 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000551 /* Need to add 1 to the line number, since this line
552 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000553 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000554 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000555 "in file %.200s on line %i, "
556 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000557 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000558 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000559 PyErr_SetString(PyExc_SyntaxError, buf);
560 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000561 }
562#endif
563 return line;
564}
565
566static int
567decoding_feof(struct tok_state *tok)
568{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000569 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000570 return feof(tok->fp);
571 } else {
572 PyObject* buf = tok->decoding_buffer;
573 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000574 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575 if (buf == NULL) {
576 error_ret(tok);
577 return 1;
578 } else {
579 tok->decoding_buffer = buf;
580 }
581 }
582 return PyObject_Length(buf) == 0;
583 }
584}
585
586/* Fetch a byte from TOK, using the string buffer. */
587
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588static int
589buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000590 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591}
592
593/* Unfetch a byte from TOK, using the string buffer. */
594
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000595static void
596buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000598 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599}
600
601/* Set the readline function for TOK to ENC. For the string-based
602 tokenizer, this means to just record the encoding. */
603
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000604static int
605buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606 tok->enc = enc;
607 return 1;
608}
609
610/* Return a UTF-8 encoding Python string object from the
611 C byte string STR, which is encoded with ENC. */
612
613static PyObject *
614translate_into_utf8(const char* str, const char* enc) {
615 PyObject *utf8;
616 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
617 if (buf == NULL)
618 return NULL;
619 utf8 = PyUnicode_AsUTF8String(buf);
620 Py_DECREF(buf);
621 return utf8;
622}
623
624/* Decode a byte string STR for use as the buffer of TOK.
625 Look for encoding declarations inside STR, and record them
626 inside TOK. */
627
628static const char *
629decode_str(const char *str, struct tok_state *tok)
630{
631 PyObject* utf8 = NULL;
632 const char *s;
633 int lineno = 0;
634 tok->enc = NULL;
635 tok->str = str;
636 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000637 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000639 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 if (tok->enc != NULL) {
641 utf8 = translate_into_utf8(str, tok->enc);
642 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000643 return error_ret(tok);
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000644 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645 }
646 for (s = str;; s++) {
647 if (*s == '\0') break;
648 else if (*s == '\n') {
649 lineno++;
650 if (lineno == 2) break;
651 }
652 }
653 tok->enc = NULL;
654 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000655 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656 if (tok->enc != NULL) {
657 assert(utf8 == NULL);
658 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000659 if (utf8 == NULL) {
660 PyErr_Format(PyExc_SyntaxError,
661 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000662 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000663 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000664 str = PyString_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 }
666 assert(tok->decoding_buffer == NULL);
667 tok->decoding_buffer = utf8; /* CAUTION */
668 return str;
669}
670
671#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672
673/* Set up tokenizer for string */
674
675struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000676PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677{
678 struct tok_state *tok = tok_new();
679 if (tok == NULL)
680 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000681 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000682 if (str == NULL) {
683 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000685 }
686
Martin v. Löwis95292d62002-12-11 14:04:59 +0000687 /* XXX: constify members. */
688 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000689 return tok;
690}
691
692
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000693/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000694
695struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000696PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000697{
698 struct tok_state *tok = tok_new();
699 if (tok == NULL)
700 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000701 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000702 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703 return NULL;
704 }
705 tok->cur = tok->inp = tok->buf;
706 tok->end = tok->buf + BUFSIZ;
707 tok->fp = fp;
708 tok->prompt = ps1;
709 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000710 if (enc != NULL) {
711 /* Must copy encoding declaration since it
712 gets copied into the parse tree. */
713 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
714 if (!tok->encoding) {
715 PyTokenizer_Free(tok);
716 return NULL;
717 }
718 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000719 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000720 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000721 return tok;
722}
723
724
725/* Free a tok_state structure */
726
727void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000728PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000730 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000731 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000732#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000733 Py_XDECREF(tok->decoding_readline);
734 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000735#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000736 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000737 PyMem_FREE(tok->buf);
738 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739}
740
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741/* Get next char, updating state; error code goes into tok->done */
742
743static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000744tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000745{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000747 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000748 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000749 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000750 if (tok->done != E_OK)
751 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000753 char *end = strchr(tok->inp, '\n');
754 if (end != NULL)
755 end++;
756 else {
757 end = strchr(tok->inp, '\0');
758 if (end == tok->inp) {
759 tok->done = E_EOF;
760 return EOF;
761 }
762 }
763 if (tok->start == NULL)
764 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000765 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000766 tok->lineno++;
767 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000768 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000771 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000772#ifndef PGEN
773 if (tok->encoding && newtok && *newtok) {
774 /* Recode to UTF-8 */
775 Py_ssize_t buflen;
776 const char* buf;
777 PyObject *u = translate_into_utf8(newtok, tok->encoding);
778 PyMem_FREE(newtok);
779 if (!u) {
780 tok->done = E_DECODE;
781 return EOF;
782 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000783 buflen = PyString_GET_SIZE(u);
784 buf = PyString_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000785 if (!buf) {
786 Py_DECREF(u);
787 tok->done = E_DECODE;
788 return EOF;
789 }
790 newtok = PyMem_MALLOC(buflen+1);
791 strcpy(newtok, buf);
792 Py_DECREF(u);
793 }
794#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000795 if (tok->nextprompt != NULL)
796 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000797 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000798 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000799 else if (*newtok == '\0') {
800 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000801 tok->done = E_EOF;
802 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000804 size_t start = tok->start - tok->buf;
805 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000806 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000807 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000808 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000809 tok->lineno++;
810 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000811 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000812 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000813 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000814 tok->done = E_NOMEM;
815 return EOF;
816 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 tok->buf = buf;
818 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000819 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000820 strcpy(tok->buf + oldlen, newtok);
821 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->inp = tok->buf + newlen;
823 tok->end = tok->inp + 1;
824 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000825 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000826 else {
827 tok->lineno++;
828 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000829 PyMem_FREE(tok->buf);
830 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000831 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000833 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 tok->inp = strchr(tok->buf, '\0');
835 tok->end = tok->inp + 1;
836 }
837 }
838 else {
839 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000840 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000841 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 if (tok->start == NULL) {
843 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000844 tok->buf = (char *)
845 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 if (tok->buf == NULL) {
847 tok->done = E_NOMEM;
848 return EOF;
849 }
850 tok->end = tok->buf + BUFSIZ;
851 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000852 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
853 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 tok->done = E_EOF;
855 done = 1;
856 }
857 else {
858 tok->done = E_OK;
859 tok->inp = strchr(tok->buf, '\0');
860 done = tok->inp[-1] == '\n';
861 }
862 }
863 else {
864 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000865 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000866 tok->done = E_EOF;
867 done = 1;
868 }
869 else
870 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000871 }
872 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000875 Py_ssize_t curstart = tok->start == NULL ? -1 :
876 tok->start - tok->buf;
877 Py_ssize_t curvalid = tok->inp - tok->buf;
878 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000879 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000880 newbuf = (char *)PyMem_REALLOC(newbuf,
881 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 if (newbuf == NULL) {
883 tok->done = E_NOMEM;
884 tok->cur = tok->inp;
885 return EOF;
886 }
887 tok->buf = newbuf;
888 tok->inp = tok->buf + curvalid;
889 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 tok->start = curstart < 0 ? NULL :
891 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000892 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000894 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000895 /* Break out early on decoding
896 errors, as tok->buf will be NULL
897 */
898 if (tok->decoding_erred)
899 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000900 /* Last line does not end in \n,
901 fake one */
902 strcpy(tok->inp, "\n");
903 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000904 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000905 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000906 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000907 if (tok->buf != NULL) {
908 tok->cur = tok->buf + cur;
909 tok->line_start = tok->cur;
910 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000911 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000912 pt = tok->inp - 2;
913 if (pt >= tok->buf && *pt == '\r') {
914 *pt++ = '\n';
915 *pt = '\0';
916 tok->inp = pt;
917 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000918 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919 }
920 if (tok->done != E_OK) {
921 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000922 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000923 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000924 return EOF;
925 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000926 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000927 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000928}
929
930
931/* Back-up one character */
932
933static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000934tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000935{
936 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000937 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000938 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000939 if (*tok->cur != c)
940 *tok->cur = c;
941 }
942}
943
944
945/* Return the token corresponding to a single character */
946
947int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000948PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949{
950 switch (c) {
951 case '(': return LPAR;
952 case ')': return RPAR;
953 case '[': return LSQB;
954 case ']': return RSQB;
955 case ':': return COLON;
956 case ',': return COMMA;
957 case ';': return SEMI;
958 case '+': return PLUS;
959 case '-': return MINUS;
960 case '*': return STAR;
961 case '/': return SLASH;
962 case '|': return VBAR;
963 case '&': return AMPER;
964 case '<': return LESS;
965 case '>': return GREATER;
966 case '=': return EQUAL;
967 case '.': return DOT;
968 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000969 case '{': return LBRACE;
970 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000971 case '^': return CIRCUMFLEX;
972 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000973 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000974 default: return OP;
975 }
976}
977
978
Guido van Rossumfbab9051991-10-20 20:25:03 +0000979int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000980PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000981{
982 switch (c1) {
983 case '=':
984 switch (c2) {
985 case '=': return EQEQUAL;
986 }
987 break;
988 case '!':
989 switch (c2) {
990 case '=': return NOTEQUAL;
991 }
992 break;
993 case '<':
994 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000995 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000996 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000997 }
998 break;
999 case '>':
1000 switch (c2) {
1001 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001002 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001003 }
1004 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001005 case '+':
1006 switch (c2) {
1007 case '=': return PLUSEQUAL;
1008 }
1009 break;
1010 case '-':
1011 switch (c2) {
1012 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001013 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001014 }
1015 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001016 case '*':
1017 switch (c2) {
1018 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 case '=': return STAREQUAL;
1020 }
1021 break;
1022 case '/':
1023 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001024 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001025 case '=': return SLASHEQUAL;
1026 }
1027 break;
1028 case '|':
1029 switch (c2) {
1030 case '=': return VBAREQUAL;
1031 }
1032 break;
1033 case '%':
1034 switch (c2) {
1035 case '=': return PERCENTEQUAL;
1036 }
1037 break;
1038 case '&':
1039 switch (c2) {
1040 case '=': return AMPEREQUAL;
1041 }
1042 break;
1043 case '^':
1044 switch (c2) {
1045 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001046 }
1047 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001048 }
1049 return OP;
1050}
1051
Thomas Wouters434d0822000-08-24 20:11:32 +00001052int
1053PyToken_ThreeChars(int c1, int c2, int c3)
1054{
1055 switch (c1) {
1056 case '<':
1057 switch (c2) {
1058 case '<':
1059 switch (c3) {
1060 case '=':
1061 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001062 }
1063 break;
1064 }
1065 break;
1066 case '>':
1067 switch (c2) {
1068 case '>':
1069 switch (c3) {
1070 case '=':
1071 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001072 }
1073 break;
1074 }
1075 break;
1076 case '*':
1077 switch (c2) {
1078 case '*':
1079 switch (c3) {
1080 case '=':
1081 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001082 }
1083 break;
1084 }
1085 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001086 case '/':
1087 switch (c2) {
1088 case '/':
1089 switch (c3) {
1090 case '=':
1091 return DOUBLESLASHEQUAL;
1092 }
1093 break;
1094 }
1095 break;
Georg Brandldde00282007-03-18 19:01:53 +00001096 case '.':
1097 switch (c2) {
1098 case '.':
1099 switch (c3) {
1100 case '.':
1101 return ELLIPSIS;
1102 }
1103 break;
1104 }
1105 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001106 }
1107 return OP;
1108}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001109
Guido van Rossum926f13a1998-04-09 21:38:06 +00001110static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001111indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112{
1113 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001114 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001115 tok->cur = tok->inp;
1116 return 1;
1117 }
1118 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001119 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1120 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001121 tok->altwarning = 0;
1122 }
1123 return 0;
1124}
1125
Martin v. Löwis47383402007-08-15 07:32:56 +00001126#ifdef PGEN
1127#define verify_identifier(s,e) 1
1128#else
1129/* Verify that the identifier follows PEP 3131. */
1130static int
1131verify_identifier(char *start, char *end)
1132{
Guido van Rossume3e37012007-08-29 18:54:41 +00001133 PyObject *s;
1134 int result;
1135 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1136 if (s == NULL) {
1137 PyErr_Clear();
1138 return 0;
1139 }
1140 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001141 Py_DECREF(s);
1142 return result;
1143}
1144#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001145
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146/* Get next token, after space stripping etc. */
1147
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001148static int
1149tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001150{
1151 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001152 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001154 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001155 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001156 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001157 blankline = 0;
1158
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 /* Get indentation level */
1160 if (tok->atbol) {
1161 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001162 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001163 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164 for (;;) {
1165 c = tok_nextc(tok);
1166 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001167 col++, altcol++;
1168 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001169 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001170 altcol = (altcol/tok->alttabsize + 1)
1171 * tok->alttabsize;
1172 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001173 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001174 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001175 else
1176 break;
1177 }
1178 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 if (c == '#' || c == '\n') {
1180 /* Lines with only whitespace and/or comments
1181 shouldn't affect the indentation and are
1182 not passed to the parser as NEWLINE tokens,
1183 except *totally* empty lines in interactive
1184 mode, which signal the end of a command group. */
1185 if (col == 0 && c == '\n' && tok->prompt != NULL)
1186 blankline = 0; /* Let it through */
1187 else
1188 blankline = 1; /* Ignore completely */
1189 /* We can't jump back right here since we still
1190 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001192 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001193 if (col == tok->indstack[tok->indent]) {
1194 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001195 if (altcol != tok->altindstack[tok->indent]) {
1196 if (indenterror(tok))
1197 return ERRORTOKEN;
1198 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200 else if (col > tok->indstack[tok->indent]) {
1201 /* Indent -- always one */
1202 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001203 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001204 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001205 return ERRORTOKEN;
1206 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001207 if (altcol <= tok->altindstack[tok->indent]) {
1208 if (indenterror(tok))
1209 return ERRORTOKEN;
1210 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001211 tok->pendin++;
1212 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001213 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001214 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001215 else /* col < tok->indstack[tok->indent] */ {
1216 /* Dedent -- any number, must be consistent */
1217 while (tok->indent > 0 &&
1218 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001219 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001220 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001221 }
1222 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001223 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001224 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001225 return ERRORTOKEN;
1226 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001227 if (altcol != tok->altindstack[tok->indent]) {
1228 if (indenterror(tok))
1229 return ERRORTOKEN;
1230 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 }
1232 }
1233 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001234
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001235 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001236
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 /* Return pending indents/dedents */
1238 if (tok->pendin != 0) {
1239 if (tok->pendin < 0) {
1240 tok->pendin++;
1241 return DEDENT;
1242 }
1243 else {
1244 tok->pendin--;
1245 return INDENT;
1246 }
1247 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001248
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001250 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 /* Skip spaces */
1252 do {
1253 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001254 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001255
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001257 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001258
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001259 /* Skip comment */
1260 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001261 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001262 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001263
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001265 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001267 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001268
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001270 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001271 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001272 /* Process b"", r"" and br"" */
1273 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001274 c = tok_nextc(tok);
1275 if (c == '"' || c == '\'')
1276 goto letter_quote;
1277 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001278 if (c == 'r' || c == 'R') {
1279 c = tok_nextc(tok);
1280 if (c == '"' || c == '\'')
1281 goto letter_quote;
1282 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001283 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001284 if (c >= 128)
1285 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001287 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001289 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001290 !verify_identifier(tok->start, tok->cur)) {
1291 tok->done = E_IDENTIFIER;
1292 return ERRORTOKEN;
1293 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001294 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001295 *p_end = tok->cur;
1296 return NAME;
1297 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001298
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299 /* Newline */
1300 if (c == '\n') {
1301 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001302 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001303 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001304 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001305 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001306 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001307 return NEWLINE;
1308 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001309
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001310 /* Period or number starting with period? */
1311 if (c == '.') {
1312 c = tok_nextc(tok);
1313 if (isdigit(c)) {
1314 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001315 } else if (c == '.') {
1316 c = tok_nextc(tok);
1317 if (c == '.') {
1318 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001319 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001320 return ELLIPSIS;
1321 } else {
1322 tok_backup(tok, c);
1323 }
1324 tok_backup(tok, '.');
1325 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001326 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001327 }
Georg Brandldde00282007-03-18 19:01:53 +00001328 *p_start = tok->start;
1329 *p_end = tok->cur;
1330 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001331 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001332
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 /* Number */
1334 if (isdigit(c)) {
1335 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001336 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001337 c = tok_nextc(tok);
1338 if (c == '.')
1339 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001340#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001341 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001342 goto imaginary;
1343#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 if (c == 'x' || c == 'X') {
1345 /* Hex */
1346 do {
1347 c = tok_nextc(tok);
1348 } while (isxdigit(c));
1349 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001350 else if (c == 'o' || c == 'O') {
1351 /* Octal */
1352 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001354 } while ('0' <= c && c < '8');
1355 }
1356 else if (c == 'b' || c == 'B') {
1357 /* Binary */
1358 do {
1359 c = tok_nextc(tok);
1360 } while (c == '0' || c == '1');
1361 }
1362 else {
1363 int nonzero = 0;
1364 /* maybe old-style octal; c is first char of it */
1365 /* in any case, allow '0' as a literal */
1366 while (c == '0')
1367 c = tok_nextc(tok);
1368 while (isdigit(c)) {
1369 nonzero = 1;
1370 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001371 }
1372 if (c == '.')
1373 goto fraction;
1374 else if (c == 'e' || c == 'E')
1375 goto exponent;
1376#ifndef WITHOUT_COMPLEX
1377 else if (c == 'j' || c == 'J')
1378 goto imaginary;
1379#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001380 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1384 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 }
1386 }
1387 else {
1388 /* Decimal */
1389 do {
1390 c = tok_nextc(tok);
1391 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001392 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001393 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001394 if (c == '.') {
1395 fraction:
1396 /* Fraction */
1397 do {
1398 c = tok_nextc(tok);
1399 } while (isdigit(c));
1400 }
1401 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001402 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001403 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001405 if (c == '+' || c == '-')
1406 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001407 if (!isdigit(c)) {
1408 tok->done = E_TOKEN;
1409 tok_backup(tok, c);
1410 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001411 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001412 do {
1413 c = tok_nextc(tok);
1414 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001416#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001417 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001418 /* Imaginary part */
1419 imaginary:
1420 c = tok_nextc(tok);
1421#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 }
1423 }
1424 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001425 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001426 *p_end = tok->cur;
1427 return NUMBER;
1428 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001429
1430 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001431 /* String */
1432 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001433 int quote = c;
1434 int quote_size = 1; /* 1 or 3 */
1435 int end_quote_size = 0;
1436
1437 /* Find the quote size and start of string */
1438 c = tok_nextc(tok);
1439 if (c == quote) {
1440 c = tok_nextc(tok);
1441 if (c == quote)
1442 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001443 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001444 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001445 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001446 if (c != quote)
1447 tok_backup(tok, c);
1448
1449 /* Get rest of string */
1450 while (end_quote_size != quote_size) {
1451 c = tok_nextc(tok);
1452 if (c == EOF) {
1453 if (quote_size == 3)
1454 tok->done = E_EOFS;
1455 else
1456 tok->done = E_EOLS;
1457 tok->cur = tok->inp;
1458 return ERRORTOKEN;
1459 }
1460 if (quote_size == 1 && c == '\n') {
1461 tok->done = E_EOLS;
1462 tok->cur = tok->inp;
1463 return ERRORTOKEN;
1464 }
1465 if (c == quote)
1466 end_quote_size += 1;
1467 else {
1468 end_quote_size = 0;
1469 if (c == '\\')
1470 c = tok_nextc(tok); /* skip escaped char */
1471 }
1472 }
1473
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001474 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001475 *p_end = tok->cur;
1476 return STRING;
1477 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001478
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479 /* Line continuation */
1480 if (c == '\\') {
1481 c = tok_nextc(tok);
1482 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001483 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001484 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001485 return ERRORTOKEN;
1486 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001487 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001488 goto again; /* Read next line */
1489 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001490
Guido van Rossumfbab9051991-10-20 20:25:03 +00001491 /* Check for two-character token */
1492 {
1493 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001494 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001495 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001496 int c3 = tok_nextc(tok);
1497 int token3 = PyToken_ThreeChars(c, c2, c3);
1498 if (token3 != OP) {
1499 token = token3;
1500 } else {
1501 tok_backup(tok, c3);
1502 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001503 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001504 *p_end = tok->cur;
1505 return token;
1506 }
1507 tok_backup(tok, c2);
1508 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001509
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001510 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001511 switch (c) {
1512 case '(':
1513 case '[':
1514 case '{':
1515 tok->level++;
1516 break;
1517 case ')':
1518 case ']':
1519 case '}':
1520 tok->level--;
1521 break;
1522 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001523
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001524 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001525 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001526 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001527 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001528}
1529
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001530int
1531PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1532{
1533 int result = tok_get(tok, p_start, p_end);
1534 if (tok->decoding_erred) {
1535 result = ERRORTOKEN;
1536 tok->done = E_DECODE;
1537 }
1538 return result;
1539}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001540
Thomas Wouters89d996e2007-09-08 17:39:28 +00001541/* This function is only called from parsetok. However, it cannot live
1542 there, as it must be empty for PGEN, and we can check for PGEN only
1543 in this file. */
1544
1545#ifdef PGEN
1546char*
1547PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1548{
1549 return NULL;
1550}
1551#else
1552static PyObject *
1553dec_utf8(const char *enc, const char *text, size_t len) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001554 PyObject *ret = NULL;
Thomas Wouters89d996e2007-09-08 17:39:28 +00001555 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1556 if (unicode_text) {
1557 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1558 Py_DECREF(unicode_text);
1559 }
1560 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001561 PyErr_Clear();
1562 }
1563 else {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001564 assert(PyString_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001565 }
1566 return ret;
1567}
1568
1569char *
1570PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1571{
1572 char *text = NULL;
1573 if (tok->encoding) {
1574 /* convert source to original encondig */
1575 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1576 if (lineobj != NULL) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001577 int linelen = PyString_GET_SIZE(lineobj);
1578 const char *line = PyString_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001579 text = PyObject_MALLOC(linelen + 1);
1580 if (text != NULL && line != NULL) {
1581 if (linelen)
1582 strncpy(text, line, linelen);
1583 text[linelen] = '\0';
1584 }
1585 Py_DECREF(lineobj);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001586
Thomas Wouters89d996e2007-09-08 17:39:28 +00001587 /* adjust error offset */
1588 if (*offset > 1) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001589 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001590 tok->buf,
1591 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001592 if (offsetobj) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001593 *offset = 1 + Py_Size(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001594 Py_DECREF(offsetobj);
1595 }
1596 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001597
Thomas Wouters89d996e2007-09-08 17:39:28 +00001598 }
1599 }
1600 return text;
1601
1602}
1603#endif
1604
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001605/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001606
1607 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001608 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001609 should be assumed to be PyUnicode_GetDefaultEncoding()).
1610
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001611 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1612 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001613*/
1614char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001615PyTokenizer_FindEncoding(int fd)
1616{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001617 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001618 FILE *fp;
1619 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001620
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001621 fd = dup(fd);
1622 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001623 return NULL;
1624 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001625 fp = fdopen(fd, "r");
1626 if (fp == NULL) {
1627 return NULL;
1628 }
1629 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1630 if (tok == NULL) {
1631 fclose(fp);
1632 return NULL;
1633 }
1634 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001635 PyTokenizer_Get(tok, &p_start, &p_end);
1636 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001637 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001638 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001639 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001640 strcpy(encoding, tok->encoding);
1641 }
1642 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001643 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001644}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001645
Guido van Rossum408027e1996-12-30 16:17:54 +00001646#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001647
1648void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001649tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001650{
Guido van Rossum86bea461997-04-29 21:03:06 +00001651 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001652 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1653 printf("(%.*s)", (int)(end - start), start);
1654}
1655
1656#endif