blob: 3733f4901764707817f02a72d9a0bdea0db65418 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C can begin an identifier: an ASCII letter, an underscore, or
   any value >= 128 (a byte of a non-ASCII character; whether it really is
   a legal identifier character is not decided here).
   The argument is evaluated several times, so it must not have side
   effects.  It is fully parenthesized so that expression arguments such
   as `x ? a : b` are handled correctly. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C can continue an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128 (a byte of a non-ASCII character).
   The argument is evaluated several times, so it must not have side
   effects.  It is fully parenthesized so that expression arguments are
   handled correctly. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Turn a char that may be signed on this platform into a nonnegative int.
   Where plain char is signed the low eight bits are masked out; where it
   is already unsigned the value is used unchanged. */
/* XXX The mask assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
Brett Cannond5ec98c2007-10-20 02:54:14 +000055
/* Token names, indexed by token number.
   This table must match the #defines in token.h!
   The number at the start of each row is the index of its first entry. */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",
    /* 10 */ "RSQB", "COLON", "COMMA", "SEMI", "PLUS",
    /* 15 */ "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL", "LESSEQUAL",
    /* 30 */ "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
             "LEFTSHIFTEQUAL",
    /* 45 */ "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL", "DOUBLESLASH",
             "DOUBLESLASHEQUAL", "AT",
    /* 50 */ "RARROW", "ELLIPSIS", "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
116
117
118/* Create and initialize a new tok_state structure */
119
120static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000123 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
124 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 if (tok == NULL)
126 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000127 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 tok->done = E_OK;
129 tok->fp = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000137 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000143 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000149 tok->decoding_readline = NULL;
150 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000151#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000152 return tok;
153}
154
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000155#ifdef PGEN
156
157static char *
158decoding_fgets(char *s, int size, struct tok_state *tok)
159{
160 return fgets(s, size, tok->fp);
161}
162
163static int
164decoding_feof(struct tok_state *tok)
165{
166 return feof(tok->fp);
167}
168
/* PGEN build: the string is used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
174
175#else /* PGEN */
176
177static char *
178error_ret(struct tok_state *tok) /* XXX */
179{
180 tok->decoding_erred = 1;
181 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 tok->buf = NULL;
184 return NULL; /* as if it were EOF */
185}
186
187static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000190 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000191 if (result != NULL) {
192 memcpy(result, s, len);
193 result[len] = '\0';
194 }
195 return result;
196}
197
/* Normalize the common spellings of the UTF-8 and Latin-1 encoding names.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced by '-' before comparing.  Returns the string
   literal "utf-8" or "iso-8859-1" when S names one of those encodings
   (optionally followed by a "-suffix"), otherwise returns S itself.
   Callers can therefore test `result != s` to learn whether the name was
   replaced; the replacement is a literal and must not be freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char; a plain (possibly negative) char is not. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
220
221/* Return the coding spec in S, or NULL if none is found. */
222
223static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000224get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000225{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000226 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000227 /* Coding spec must be in a comment, and that comment must be
228 * the only statement on the source code line. */
229 for (i = 0; i < size - 6; i++) {
230 if (s[i] == '#')
231 break;
232 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
233 return NULL;
234 }
235 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236 const char* t = s + i;
237 if (strncmp(t, "coding", 6) == 0) {
238 const char* begin = NULL;
239 t += 6;
240 if (t[0] != ':' && t[0] != '=')
241 continue;
242 do {
243 t++;
244 } while (t[0] == '\x20' || t[0] == '\t');
245
246 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000247 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000248 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 t++;
250
251 if (begin < t) {
252 char* r = new_string(begin, t - begin);
253 char* q = get_normal_name(r);
254 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000256 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 }
258 return r;
259 }
260 }
261 }
262 return NULL;
263}
264
265/* Check whether the line contains a coding spec. If it does,
266 invoke the set_readline function for the new encoding.
267 This function receives the tok_state and the new encoding.
268 Return 1 on success, 0 on failure. */
269
270static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000271check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int set_readline(struct tok_state *, const char *))
273{
Tim Peters17db21f2002-09-03 15:39:58 +0000274 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000276
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000277 if (tok->cont_line)
278 /* It's a continuation line, so it can't be a coding spec. */
279 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000280 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 if (cs != NULL) {
282 tok->read_coding_spec = 1;
283 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 if (strcmp(cs, "utf-8") == 0 ||
286 strcmp(cs, "iso-8859-1") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000326 ch = get_char(tok);
327 if (ch != 0xBB) {
328 unget_char(ch, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
333 ch = get_char(tok);
334 if (ch != 0xBF) {
335 unget_char(ch, tok);
336 unget_char(0xBB, tok);
337 unget_char(0xEF, tok);
338 /* any token beginning with '\xEF' is a bad token */
339 return 1;
340 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341#if 0
342 /* Disable support for UTF-16 BOMs until a decision
343 is made whether this needs to be supported. */
344 } else if (ch == 0xFE) {
345 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
346 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000347 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 } else if (ch == 0xFF) {
349 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
350 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000351 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352#endif
353 } else {
354 unget_char(ch, tok);
355 return 1;
356 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000357 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000359 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000360 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361 return 1;
362}
363
364/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000365 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000367 On entry, tok->decoding_buffer will be one of:
368 1) NULL: need to call tok->decoding_readline to get a new line
369 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
370 stored the result in tok->decoding_buffer
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000371 3) PyBytesObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372 (in the s buffer) to copy entire contents of the line read
373 by tok->decoding_readline. tok->decoding_buffer has the overflow.
374 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 reached): see tok_nextc and its calls to decoding_fgets.
377*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000378
379static char *
380fp_readl(char *s, int size, struct tok_state *tok)
381{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000383 const char *buf;
384 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000385
386 /* Ask for one less byte so we can terminate it */
387 assert(size > 0);
388 size--;
389
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000390 if (tok->decoding_buffer) {
391 bufobj = tok->decoding_buffer;
392 Py_INCREF(bufobj);
393 }
394 else
395 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000396 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
397 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000400 if (PyUnicode_CheckExact(bufobj))
401 {
402 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
403 if (buf == NULL) {
404 goto error;
405 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000406 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000407 else
408 {
409 buf = PyBytes_AsString(bufobj);
410 if (buf == NULL) {
411 goto error;
412 }
413 buflen = PyBytes_GET_SIZE(bufobj);
414 }
415
416 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000417 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 /* Too many chars, the rest goes into tok->decoding_buffer */
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000419 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
420 buflen-size);
421 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000423 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000424 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 else
426 tok->decoding_buffer = NULL;
427
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 memcpy(s, buf, buflen);
429 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000430 if (buflen == 0) /* EOF */
431 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000432 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000434
435error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000437 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000438}
439
440/* Set the readline function for TOK to a StreamReader's
441 readline function. The StreamReader is named ENC.
442
443 This function is called from check_bom and check_coding_spec.
444
445 ENC is usually identical to the future value of tok->encoding,
446 except for the (currently unsupported) case of UTF-16.
447
448 Return 1 on success, 0 on failure. */
449
450static int
451fp_setreadl(struct tok_state *tok, const char* enc)
452{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000453 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
Christian Heimes819b8bf2008-01-03 23:05:47 +0000455 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456 if (io == NULL)
457 goto cleanup;
458
459 stream = PyObject_CallMethod(io, "open", "ssis",
460 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000461 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000464 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000465 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467
468 cleanup:
469 Py_XDECREF(stream);
470 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000471 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472}
473
474/* Fetch the next byte from TOK. */
475
476static int fp_getc(struct tok_state *tok) {
477 return getc(tok->fp);
478}
479
480/* Unfetch the last byte back into TOK. */
481
482static void fp_ungetc(int c, struct tok_state *tok) {
483 ungetc(c, tok->fp);
484}
485
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   s must be NUL-terminated.  Continuation bytes are examined in order,
   so a sequence cut short by the terminator is rejected without reading
   past it.  Lead bytes that can never occur in UTF-8 are rejected: 0xC0
   and 0xC1 (overlong two-byte forms) and 0xF5 and above (beyond
   U+10FFFF).  This is only a structural check: every continuation byte
   in 0x80..0xBF is accepted, so some overlong three- and four-byte forms
   and surrogate code points still pass. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC2)
        /* 0x80-0xBF: following byte without a lead byte;
           0xC0-0xC1: overlong encoding of a single-byte code */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF5)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
513
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514/* Read a line of input from TOK. Determine encoding
515 if necessary. */
516
517static char *
518decoding_fgets(char *s, int size, struct tok_state *tok)
519{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000520 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000521 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000522 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000523 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 /* We already have a codec associated with
525 this input. */
526 line = fp_readl(s, size, tok);
527 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000528 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000530 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 break;
533 } else {
534 /* We have not yet determined the encoding.
535 If an encoding is found, use the file-pointer
536 reader functions from now on. */
537 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
538 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000539 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000541 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
543 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
544 return error_ret(tok);
545 }
546 }
547#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000548 /* The default encoding is UTF-8, so make sure we don't have any
549 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000550 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000552 int length;
553 for (c = (unsigned char *)line; *c; c += length)
554 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 badchar = *c;
556 break;
557 }
558 }
559 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000560 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000561 /* Need to add 1 to the line number, since this line
562 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000563 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000564 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000565 "in file %.200s on line %i, "
566 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000567 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000568 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000569 PyErr_SetString(PyExc_SyntaxError, buf);
570 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 }
572#endif
573 return line;
574}
575
576static int
577decoding_feof(struct tok_state *tok)
578{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000579 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 return feof(tok->fp);
581 } else {
582 PyObject* buf = tok->decoding_buffer;
583 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000584 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 if (buf == NULL) {
586 error_ret(tok);
587 return 1;
588 } else {
589 tok->decoding_buffer = buf;
590 }
591 }
592 return PyObject_Length(buf) == 0;
593 }
594}
595
596/* Fetch a byte from TOK, using the string buffer. */
597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000598static int
599buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000600 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Unfetch a byte from TOK, using the string buffer. */
604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000605static void
606buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000608 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609}
610
611/* Set the readline function for TOK to ENC. For the string-based
612 tokenizer, this means to just record the encoding. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000616 tok->enc = enc;
617 return 1;
618}
619
620/* Return a UTF-8 encoding Python string object from the
621 C byte string STR, which is encoded with ENC. */
622
623static PyObject *
624translate_into_utf8(const char* str, const char* enc) {
625 PyObject *utf8;
626 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
627 if (buf == NULL)
628 return NULL;
629 utf8 = PyUnicode_AsUTF8String(buf);
630 Py_DECREF(buf);
631 return utf8;
632}
633
634/* Decode a byte string STR for use as the buffer of TOK.
635 Look for encoding declarations inside STR, and record them
636 inside TOK. */
637
638static const char *
639decode_str(const char *str, struct tok_state *tok)
640{
641 PyObject* utf8 = NULL;
642 const char *s;
643 int lineno = 0;
644 tok->enc = NULL;
645 tok->str = str;
646 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000647 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000649 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 if (tok->enc != NULL) {
651 utf8 = translate_into_utf8(str, tok->enc);
652 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000653 return error_ret(tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000654 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655 }
656 for (s = str;; s++) {
657 if (*s == '\0') break;
658 else if (*s == '\n') {
659 lineno++;
660 if (lineno == 2) break;
661 }
662 }
663 tok->enc = NULL;
664 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000665 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666 if (tok->enc != NULL) {
667 assert(utf8 == NULL);
668 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000669 if (utf8 == NULL) {
670 PyErr_Format(PyExc_SyntaxError,
671 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000672 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000673 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000674 str = PyString_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675 }
676 assert(tok->decoding_buffer == NULL);
677 tok->decoding_buffer = utf8; /* CAUTION */
678 return str;
679}
680
681#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682
683/* Set up tokenizer for string */
684
685struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000686PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687{
688 struct tok_state *tok = tok_new();
689 if (tok == NULL)
690 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000692 if (str == NULL) {
693 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000694 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000695 }
696
Martin v. Löwis95292d62002-12-11 14:04:59 +0000697 /* XXX: constify members. */
698 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000699 return tok;
700}
701
702
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000703/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000704
705struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000706PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000707{
708 struct tok_state *tok = tok_new();
709 if (tok == NULL)
710 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000711 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000712 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000713 return NULL;
714 }
715 tok->cur = tok->inp = tok->buf;
716 tok->end = tok->buf + BUFSIZ;
717 tok->fp = fp;
718 tok->prompt = ps1;
719 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000720 if (enc != NULL) {
721 /* Must copy encoding declaration since it
722 gets copied into the parse tree. */
723 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
724 if (!tok->encoding) {
725 PyTokenizer_Free(tok);
726 return NULL;
727 }
728 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000729 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000730 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731 return tok;
732}
733
734
735/* Free a tok_state structure */
736
737void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000738PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000740 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000741 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000742#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000743 Py_XDECREF(tok->decoding_readline);
744 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000745#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000747 PyMem_FREE(tok->buf);
748 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749}
750
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751/* Get next char, updating state; error code goes into tok->done */
752
753static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000754tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000757 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000758 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000759 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000760 if (tok->done != E_OK)
761 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000763 char *end = strchr(tok->inp, '\n');
764 if (end != NULL)
765 end++;
766 else {
767 end = strchr(tok->inp, '\0');
768 if (end == tok->inp) {
769 tok->done = E_EOF;
770 return EOF;
771 }
772 }
773 if (tok->start == NULL)
774 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000775 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000776 tok->lineno++;
777 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000778 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000782#ifndef PGEN
783 if (tok->encoding && newtok && *newtok) {
784 /* Recode to UTF-8 */
785 Py_ssize_t buflen;
786 const char* buf;
787 PyObject *u = translate_into_utf8(newtok, tok->encoding);
788 PyMem_FREE(newtok);
789 if (!u) {
790 tok->done = E_DECODE;
791 return EOF;
792 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000793 buflen = PyString_GET_SIZE(u);
794 buf = PyString_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000795 if (!buf) {
796 Py_DECREF(u);
797 tok->done = E_DECODE;
798 return EOF;
799 }
800 newtok = PyMem_MALLOC(buflen+1);
801 strcpy(newtok, buf);
802 Py_DECREF(u);
803 }
804#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805 if (tok->nextprompt != NULL)
806 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000807 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000809 else if (*newtok == '\0') {
810 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000811 tok->done = E_EOF;
812 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000813 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000814 size_t start = tok->start - tok->buf;
815 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000816 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000817 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000818 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->lineno++;
820 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000821 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000822 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000823 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000824 tok->done = E_NOMEM;
825 return EOF;
826 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 tok->buf = buf;
828 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000829 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000830 strcpy(tok->buf + oldlen, newtok);
831 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 tok->inp = tok->buf + newlen;
833 tok->end = tok->inp + 1;
834 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000835 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 else {
837 tok->lineno++;
838 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000839 PyMem_FREE(tok->buf);
840 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000841 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000843 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 tok->inp = strchr(tok->buf, '\0');
845 tok->end = tok->inp + 1;
846 }
847 }
848 else {
849 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000850 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000851 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000852 if (tok->start == NULL) {
853 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000854 tok->buf = (char *)
855 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 if (tok->buf == NULL) {
857 tok->done = E_NOMEM;
858 return EOF;
859 }
860 tok->end = tok->buf + BUFSIZ;
861 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000862 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
863 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 tok->done = E_EOF;
865 done = 1;
866 }
867 else {
868 tok->done = E_OK;
869 tok->inp = strchr(tok->buf, '\0');
870 done = tok->inp[-1] == '\n';
871 }
872 }
873 else {
874 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000875 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000876 tok->done = E_EOF;
877 done = 1;
878 }
879 else
880 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000881 }
882 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000883 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000884 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000885 Py_ssize_t curstart = tok->start == NULL ? -1 :
886 tok->start - tok->buf;
887 Py_ssize_t curvalid = tok->inp - tok->buf;
888 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000889 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000890 newbuf = (char *)PyMem_REALLOC(newbuf,
891 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000892 if (newbuf == NULL) {
893 tok->done = E_NOMEM;
894 tok->cur = tok->inp;
895 return EOF;
896 }
897 tok->buf = newbuf;
898 tok->inp = tok->buf + curvalid;
899 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000900 tok->start = curstart < 0 ? NULL :
901 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000902 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000903 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000904 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000905 /* Break out early on decoding
906 errors, as tok->buf will be NULL
907 */
908 if (tok->decoding_erred)
909 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000910 /* Last line does not end in \n,
911 fake one */
912 strcpy(tok->inp, "\n");
913 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000914 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000915 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000916 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000917 if (tok->buf != NULL) {
918 tok->cur = tok->buf + cur;
919 tok->line_start = tok->cur;
920 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000921 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000922 pt = tok->inp - 2;
923 if (pt >= tok->buf && *pt == '\r') {
924 *pt++ = '\n';
925 *pt = '\0';
926 tok->inp = pt;
927 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000928 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000929 }
930 if (tok->done != E_OK) {
931 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000932 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000933 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000934 return EOF;
935 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000936 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000937 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000938}
939
940
941/* Back-up one character */
942
943static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000944tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000945{
946 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000947 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000948 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949 if (*tok->cur != c)
950 *tok->cur = c;
951 }
952}
953
954
955/* Return the token corresponding to a single character */
956
957int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000958PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000959{
960 switch (c) {
961 case '(': return LPAR;
962 case ')': return RPAR;
963 case '[': return LSQB;
964 case ']': return RSQB;
965 case ':': return COLON;
966 case ',': return COMMA;
967 case ';': return SEMI;
968 case '+': return PLUS;
969 case '-': return MINUS;
970 case '*': return STAR;
971 case '/': return SLASH;
972 case '|': return VBAR;
973 case '&': return AMPER;
974 case '<': return LESS;
975 case '>': return GREATER;
976 case '=': return EQUAL;
977 case '.': return DOT;
978 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000979 case '{': return LBRACE;
980 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000981 case '^': return CIRCUMFLEX;
982 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000983 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000984 default: return OP;
985 }
986}
987
988
Guido van Rossumfbab9051991-10-20 20:25:03 +0000989int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000990PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000991{
992 switch (c1) {
993 case '=':
994 switch (c2) {
995 case '=': return EQEQUAL;
996 }
997 break;
998 case '!':
999 switch (c2) {
1000 case '=': return NOTEQUAL;
1001 }
1002 break;
1003 case '<':
1004 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +00001005 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001006 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001007 }
1008 break;
1009 case '>':
1010 switch (c2) {
1011 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001012 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001013 }
1014 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001015 case '+':
1016 switch (c2) {
1017 case '=': return PLUSEQUAL;
1018 }
1019 break;
1020 case '-':
1021 switch (c2) {
1022 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001023 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001024 }
1025 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001026 case '*':
1027 switch (c2) {
1028 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001029 case '=': return STAREQUAL;
1030 }
1031 break;
1032 case '/':
1033 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001034 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001035 case '=': return SLASHEQUAL;
1036 }
1037 break;
1038 case '|':
1039 switch (c2) {
1040 case '=': return VBAREQUAL;
1041 }
1042 break;
1043 case '%':
1044 switch (c2) {
1045 case '=': return PERCENTEQUAL;
1046 }
1047 break;
1048 case '&':
1049 switch (c2) {
1050 case '=': return AMPEREQUAL;
1051 }
1052 break;
1053 case '^':
1054 switch (c2) {
1055 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001056 }
1057 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001058 }
1059 return OP;
1060}
1061
Thomas Wouters434d0822000-08-24 20:11:32 +00001062int
1063PyToken_ThreeChars(int c1, int c2, int c3)
1064{
1065 switch (c1) {
1066 case '<':
1067 switch (c2) {
1068 case '<':
1069 switch (c3) {
1070 case '=':
1071 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001072 }
1073 break;
1074 }
1075 break;
1076 case '>':
1077 switch (c2) {
1078 case '>':
1079 switch (c3) {
1080 case '=':
1081 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001082 }
1083 break;
1084 }
1085 break;
1086 case '*':
1087 switch (c2) {
1088 case '*':
1089 switch (c3) {
1090 case '=':
1091 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001092 }
1093 break;
1094 }
1095 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001096 case '/':
1097 switch (c2) {
1098 case '/':
1099 switch (c3) {
1100 case '=':
1101 return DOUBLESLASHEQUAL;
1102 }
1103 break;
1104 }
1105 break;
Georg Brandldde00282007-03-18 19:01:53 +00001106 case '.':
1107 switch (c2) {
1108 case '.':
1109 switch (c3) {
1110 case '.':
1111 return ELLIPSIS;
1112 }
1113 break;
1114 }
1115 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001116 }
1117 return OP;
1118}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001119
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001121indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001122{
1123 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001124 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001125 tok->cur = tok->inp;
1126 return 1;
1127 }
1128 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001129 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1130 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001131 tok->altwarning = 0;
1132 }
1133 return 0;
1134}
1135
Martin v. Löwis47383402007-08-15 07:32:56 +00001136#ifdef PGEN
1137#define verify_identifier(s,e) 1
1138#else
1139/* Verify that the identifier follows PEP 3131. */
1140static int
1141verify_identifier(char *start, char *end)
1142{
Guido van Rossume3e37012007-08-29 18:54:41 +00001143 PyObject *s;
1144 int result;
1145 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1146 if (s == NULL) {
1147 PyErr_Clear();
1148 return 0;
1149 }
1150 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001151 Py_DECREF(s);
1152 return result;
1153}
1154#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001156/* Get next token, after space stripping etc. */
1157
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001158static int
1159tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001160{
1161 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001162 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001163
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001164 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001165 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001166 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001167 blankline = 0;
1168
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001169 /* Get indentation level */
1170 if (tok->atbol) {
1171 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001172 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001173 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001174 for (;;) {
1175 c = tok_nextc(tok);
1176 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001177 col++, altcol++;
1178 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001180 altcol = (altcol/tok->alttabsize + 1)
1181 * tok->alttabsize;
1182 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001183 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001184 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 else
1186 break;
1187 }
1188 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001189 if (c == '#' || c == '\n') {
1190 /* Lines with only whitespace and/or comments
1191 shouldn't affect the indentation and are
1192 not passed to the parser as NEWLINE tokens,
1193 except *totally* empty lines in interactive
1194 mode, which signal the end of a command group. */
1195 if (col == 0 && c == '\n' && tok->prompt != NULL)
1196 blankline = 0; /* Let it through */
1197 else
1198 blankline = 1; /* Ignore completely */
1199 /* We can't jump back right here since we still
1200 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001201 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001202 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001203 if (col == tok->indstack[tok->indent]) {
1204 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001205 if (altcol != tok->altindstack[tok->indent]) {
1206 if (indenterror(tok))
1207 return ERRORTOKEN;
1208 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001210 else if (col > tok->indstack[tok->indent]) {
1211 /* Indent -- always one */
1212 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001213 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001214 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001215 return ERRORTOKEN;
1216 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001217 if (altcol <= tok->altindstack[tok->indent]) {
1218 if (indenterror(tok))
1219 return ERRORTOKEN;
1220 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001221 tok->pendin++;
1222 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001223 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001225 else /* col < tok->indstack[tok->indent] */ {
1226 /* Dedent -- any number, must be consistent */
1227 while (tok->indent > 0 &&
1228 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001229 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001230 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001231 }
1232 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001233 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001234 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001235 return ERRORTOKEN;
1236 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001237 if (altcol != tok->altindstack[tok->indent]) {
1238 if (indenterror(tok))
1239 return ERRORTOKEN;
1240 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001241 }
1242 }
1243 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001244
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001245 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001246
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001247 /* Return pending indents/dedents */
1248 if (tok->pendin != 0) {
1249 if (tok->pendin < 0) {
1250 tok->pendin++;
1251 return DEDENT;
1252 }
1253 else {
1254 tok->pendin--;
1255 return INDENT;
1256 }
1257 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001258
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001260 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 /* Skip spaces */
1262 do {
1263 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001264 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001265
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001267 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001268
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001269 /* Skip comment */
1270 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001271 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001273
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001274 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001275 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001276 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001277 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001278
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001280 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001281 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001282 /* Process b"", r"" and br"" */
1283 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001284 c = tok_nextc(tok);
1285 if (c == '"' || c == '\'')
1286 goto letter_quote;
1287 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001288 if (c == 'r' || c == 'R') {
1289 c = tok_nextc(tok);
1290 if (c == '"' || c == '\'')
1291 goto letter_quote;
1292 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001293 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001294 if (c >= 128)
1295 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001297 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001299 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001300 !verify_identifier(tok->start, tok->cur)) {
1301 tok->done = E_IDENTIFIER;
1302 return ERRORTOKEN;
1303 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001304 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001305 *p_end = tok->cur;
1306 return NAME;
1307 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001308
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 /* Newline */
1310 if (c == '\n') {
1311 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001312 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001313 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001314 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001315 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001316 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 return NEWLINE;
1318 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001319
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001320 /* Period or number starting with period? */
1321 if (c == '.') {
1322 c = tok_nextc(tok);
1323 if (isdigit(c)) {
1324 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001325 } else if (c == '.') {
1326 c = tok_nextc(tok);
1327 if (c == '.') {
1328 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001329 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001330 return ELLIPSIS;
1331 } else {
1332 tok_backup(tok, c);
1333 }
1334 tok_backup(tok, '.');
1335 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001336 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001337 }
Georg Brandldde00282007-03-18 19:01:53 +00001338 *p_start = tok->start;
1339 *p_end = tok->cur;
1340 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001341 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001342
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001343 /* Number */
1344 if (isdigit(c)) {
1345 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001346 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 c = tok_nextc(tok);
1348 if (c == '.')
1349 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001350#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001351 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001352 goto imaginary;
1353#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001355
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001356 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001357 c = tok_nextc(tok);
1358 if (!isxdigit(c)) {
1359 tok->done = E_TOKEN;
1360 tok_backup(tok, c);
1361 return ERRORTOKEN;
1362 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 do {
1364 c = tok_nextc(tok);
1365 } while (isxdigit(c));
1366 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001367 else if (c == 'o' || c == 'O') {
1368 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001369 c = tok_nextc(tok);
1370 if (c < '0' || c > '8') {
1371 tok->done = E_TOKEN;
1372 tok_backup(tok, c);
1373 return ERRORTOKEN;
1374 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001375 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001376 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001377 } while ('0' <= c && c < '8');
1378 }
1379 else if (c == 'b' || c == 'B') {
1380 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001381 c = tok_nextc(tok);
1382 if (c != '0' && c != '1') {
1383 tok->done = E_TOKEN;
1384 tok_backup(tok, c);
1385 return ERRORTOKEN;
1386 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001387 do {
1388 c = tok_nextc(tok);
1389 } while (c == '0' || c == '1');
1390 }
1391 else {
1392 int nonzero = 0;
1393 /* maybe old-style octal; c is first char of it */
1394 /* in any case, allow '0' as a literal */
1395 while (c == '0')
1396 c = tok_nextc(tok);
1397 while (isdigit(c)) {
1398 nonzero = 1;
1399 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001400 }
1401 if (c == '.')
1402 goto fraction;
1403 else if (c == 'e' || c == 'E')
1404 goto exponent;
1405#ifndef WITHOUT_COMPLEX
1406 else if (c == 'j' || c == 'J')
1407 goto imaginary;
1408#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001409 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001410 tok->done = E_TOKEN;
1411 tok_backup(tok, c);
1412 return ERRORTOKEN;
1413 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 }
1415 }
1416 else {
1417 /* Decimal */
1418 do {
1419 c = tok_nextc(tok);
1420 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001421 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001422 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001423 if (c == '.') {
1424 fraction:
1425 /* Fraction */
1426 do {
1427 c = tok_nextc(tok);
1428 } while (isdigit(c));
1429 }
1430 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001431 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001432 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001433 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001434 if (c == '+' || c == '-')
1435 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001436 if (!isdigit(c)) {
1437 tok->done = E_TOKEN;
1438 tok_backup(tok, c);
1439 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001440 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001441 do {
1442 c = tok_nextc(tok);
1443 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001445#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001446 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001447 /* Imaginary part */
1448 imaginary:
1449 c = tok_nextc(tok);
1450#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001451 }
1452 }
1453 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001454 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 *p_end = tok->cur;
1456 return NUMBER;
1457 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001458
1459 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 /* String */
1461 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001462 int quote = c;
1463 int quote_size = 1; /* 1 or 3 */
1464 int end_quote_size = 0;
1465
1466 /* Find the quote size and start of string */
1467 c = tok_nextc(tok);
1468 if (c == quote) {
1469 c = tok_nextc(tok);
1470 if (c == quote)
1471 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001472 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001473 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001475 if (c != quote)
1476 tok_backup(tok, c);
1477
1478 /* Get rest of string */
1479 while (end_quote_size != quote_size) {
1480 c = tok_nextc(tok);
1481 if (c == EOF) {
1482 if (quote_size == 3)
1483 tok->done = E_EOFS;
1484 else
1485 tok->done = E_EOLS;
1486 tok->cur = tok->inp;
1487 return ERRORTOKEN;
1488 }
1489 if (quote_size == 1 && c == '\n') {
1490 tok->done = E_EOLS;
1491 tok->cur = tok->inp;
1492 return ERRORTOKEN;
1493 }
1494 if (c == quote)
1495 end_quote_size += 1;
1496 else {
1497 end_quote_size = 0;
1498 if (c == '\\')
1499 c = tok_nextc(tok); /* skip escaped char */
1500 }
1501 }
1502
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001503 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001504 *p_end = tok->cur;
1505 return STRING;
1506 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001507
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001508 /* Line continuation */
1509 if (c == '\\') {
1510 c = tok_nextc(tok);
1511 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001512 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001513 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001514 return ERRORTOKEN;
1515 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001516 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001517 goto again; /* Read next line */
1518 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001519
Guido van Rossumfbab9051991-10-20 20:25:03 +00001520 /* Check for two-character token */
1521 {
1522 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001523 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001524 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001525 int c3 = tok_nextc(tok);
1526 int token3 = PyToken_ThreeChars(c, c2, c3);
1527 if (token3 != OP) {
1528 token = token3;
1529 } else {
1530 tok_backup(tok, c3);
1531 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001532 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001533 *p_end = tok->cur;
1534 return token;
1535 }
1536 tok_backup(tok, c2);
1537 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001538
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001539 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001540 switch (c) {
1541 case '(':
1542 case '[':
1543 case '{':
1544 tok->level++;
1545 break;
1546 case ')':
1547 case ']':
1548 case '}':
1549 tok->level--;
1550 break;
1551 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001552
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001553 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001554 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001555 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001556 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001557}
1558
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001559int
1560PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1561{
1562 int result = tok_get(tok, p_start, p_end);
1563 if (tok->decoding_erred) {
1564 result = ERRORTOKEN;
1565 tok->done = E_DECODE;
1566 }
1567 return result;
1568}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001569
Thomas Wouters89d996e2007-09-08 17:39:28 +00001570/* This function is only called from parsetok. However, it cannot live
1571 there, as it must be empty for PGEN, and we can check for PGEN only
1572 in this file. */
1573
1574#ifdef PGEN
1575char*
1576PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1577{
1578 return NULL;
1579}
1580#else
1581static PyObject *
1582dec_utf8(const char *enc, const char *text, size_t len) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001583 PyObject *ret = NULL;
Thomas Wouters89d996e2007-09-08 17:39:28 +00001584 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1585 if (unicode_text) {
1586 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1587 Py_DECREF(unicode_text);
1588 }
1589 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001590 PyErr_Clear();
1591 }
1592 else {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001593 assert(PyString_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001594 }
1595 return ret;
1596}
1597
1598char *
1599PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1600{
1601 char *text = NULL;
1602 if (tok->encoding) {
1603 /* convert source to original encondig */
1604 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1605 if (lineobj != NULL) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001606 int linelen = PyString_GET_SIZE(lineobj);
1607 const char *line = PyString_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001608 text = PyObject_MALLOC(linelen + 1);
1609 if (text != NULL && line != NULL) {
1610 if (linelen)
1611 strncpy(text, line, linelen);
1612 text[linelen] = '\0';
1613 }
1614 Py_DECREF(lineobj);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001615
Thomas Wouters89d996e2007-09-08 17:39:28 +00001616 /* adjust error offset */
1617 if (*offset > 1) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001618 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001619 tok->buf,
1620 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001621 if (offsetobj) {
Christian Heimes90aa7642007-12-19 02:45:37 +00001622 *offset = 1 + Py_SIZE(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001623 Py_DECREF(offsetobj);
1624 }
1625 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001626
Thomas Wouters89d996e2007-09-08 17:39:28 +00001627 }
1628 }
1629 return text;
1630
1631}
1632#endif
1633
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001634/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001635
1636 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001637 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001638 should be assumed to be PyUnicode_GetDefaultEncoding()).
1639
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001640 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1641 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001642*/
1643char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001644PyTokenizer_FindEncoding(int fd)
1645{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001646 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001647 FILE *fp;
1648 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001649
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001650 fd = dup(fd);
1651 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001652 return NULL;
1653 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001654 fp = fdopen(fd, "r");
1655 if (fp == NULL) {
1656 return NULL;
1657 }
1658 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1659 if (tok == NULL) {
1660 fclose(fp);
1661 return NULL;
1662 }
1663 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001664 PyTokenizer_Get(tok, &p_start, &p_end);
1665 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001666 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001667 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001668 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001669 strcpy(encoding, tok->encoding);
1670 }
1671 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001672 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001673}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001674
Guido van Rossum408027e1996-12-30 16:17:54 +00001675#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001676
1677void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001678tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001679{
Guido van Rossum86bea461997-04-29 21:03:06 +00001680 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001681 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1682 printf("(%.*s)", (int)(end - start), start);
1683}
1684
1685#endif