blob: 2833e532f7f38f56d4420c7356fd2055b957e9ef [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero when C may begin an identifier: an ASCII letter, '_', or any
   value >= 128 (a byte of a non-ASCII character).  Every use of the
   argument is parenthesized so that an arbitrary expression can be
   passed.  The argument is evaluated several times, so it must not have
   side effects. */
#define is_potential_identifier_start(c) (\
			  ((c) >= 'a' && (c) <= 'z')\
		       || ((c) >= 'A' && (c) <= 'Z')\
		       || (c) == '_'\
		       || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero when C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128 (a byte of a non-ASCII character).
   Every use of the argument is parenthesized so that an arbitrary
   expression can be passed.  The argument is evaluated several times, so
   it must not have side effects. */
#define is_potential_identifier_char(c) (\
			  ((c) >= 'a' && (c) <= 'z')\
		       || ((c) >= 'A' && (c) <= 'Z')\
		       || ((c) >= '0' && (c) <= '9')\
		       || (c) == '_'\
		       || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Map a char that may be signed onto a nonnegative int. */
/* XXX Relies on characters being exactly 8 bits wide. */
#ifndef __CHAR_UNSIGNED__
/* char may be negative here: keep only the low 8 bits */
#define Py_CHARMASK(c)		((c) & 0xff)
#else
/* char is already unsigned: nothing to do */
#define Py_CHARMASK(c)		(c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
Brett Cannond5ec98c2007-10-20 02:54:14 +000055
/* Printable token names, indexed by token number. */

char *_PyParser_TokenNames[] = {
	/* structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",
	/* brackets and punctuation */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",
	/* single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT",
	"LBRACE", "RBRACE",
	/* comparisons */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* remaining operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	"DOUBLESTAR",
	/* augmented assignment */
	"PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
	"PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
	"DOUBLESLASH", "DOUBLESLASHEQUAL",
	"AT", "RARROW", "ELLIPSIS",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
116
117
118/* Create and initialize a new tok_state structure */
119
120static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000123 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
124 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 if (tok == NULL)
126 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000127 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 tok->done = E_OK;
129 tok->fp = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000137 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000143 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000149 tok->decoding_readline = NULL;
150 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000151#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000152 return tok;
153}
154
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000155#ifdef PGEN
156
157static char *
158decoding_fgets(char *s, int size, struct tok_state *tok)
159{
160 return fgets(s, size, tok->fp);
161}
162
163static int
164decoding_feof(struct tok_state *tok)
165{
166 return feof(tok->fp);
167}
168
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	/* pgen build: the string is used as is; TOK is not touched */
	(void)tok;
	return str;
}
174
175#else /* PGEN */
176
177static char *
178error_ret(struct tok_state *tok) /* XXX */
179{
180 tok->decoding_erred = 1;
181 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 tok->buf = NULL;
184 return NULL; /* as if it were EOF */
185}
186
187static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000190 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000191 if (result != NULL) {
192 memcpy(result, s, len);
193 result[len] = '\0';
194 }
195 return result;
196}
197
/* Map the common spellings of UTF-8 and Latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, after lowercasing
   them and turning '_' into '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when the name (or its prefix followed by '-') matches;
   otherwise returns S itself, unchanged.  The caller can therefore test
   the result against S to see whether a replacement happened. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* tolower() is only defined for unsigned char
			   values and EOF, so a negative char must be
			   converted first. */
			buf[i] = (char)tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
220
221/* Return the coding spec in S, or NULL if none is found. */
222
223static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000224get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000225{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000226 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000227 /* Coding spec must be in a comment, and that comment must be
228 * the only statement on the source code line. */
229 for (i = 0; i < size - 6; i++) {
230 if (s[i] == '#')
231 break;
232 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
233 return NULL;
234 }
235 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236 const char* t = s + i;
237 if (strncmp(t, "coding", 6) == 0) {
238 const char* begin = NULL;
239 t += 6;
240 if (t[0] != ':' && t[0] != '=')
241 continue;
242 do {
243 t++;
244 } while (t[0] == '\x20' || t[0] == '\t');
245
246 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000247 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000248 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 t++;
250
251 if (begin < t) {
252 char* r = new_string(begin, t - begin);
253 char* q = get_normal_name(r);
254 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000256 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 }
258 return r;
259 }
260 }
261 }
262 return NULL;
263}
264
265/* Check whether the line contains a coding spec. If it does,
266 invoke the set_readline function for the new encoding.
267 This function receives the tok_state and the new encoding.
268 Return 1 on success, 0 on failure. */
269
270static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000271check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int set_readline(struct tok_state *, const char *))
273{
Tim Peters17db21f2002-09-03 15:39:58 +0000274 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000276
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000277 if (tok->cont_line)
278 /* It's a continuation line, so it can't be a coding spec. */
279 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000280 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 if (cs != NULL) {
282 tok->read_coding_spec = 1;
283 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 if (strcmp(cs, "utf-8") == 0 ||
286 strcmp(cs, "iso-8859-1") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000326 ch = get_char(tok);
327 if (ch != 0xBB) {
328 unget_char(ch, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
333 ch = get_char(tok);
334 if (ch != 0xBF) {
335 unget_char(ch, tok);
336 unget_char(0xBB, tok);
337 unget_char(0xEF, tok);
338 /* any token beginning with '\xEF' is a bad token */
339 return 1;
340 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341#if 0
342 /* Disable support for UTF-16 BOMs until a decision
343 is made whether this needs to be supported. */
344 } else if (ch == 0xFE) {
345 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
346 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000347 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 } else if (ch == 0xFF) {
349 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
350 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000351 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352#endif
353 } else {
354 unget_char(ch, tok);
355 return 1;
356 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000357 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000359 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000360 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361 return 1;
362}
363
364/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000365 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000367 On entry, tok->decoding_buffer will be one of:
368 1) NULL: need to call tok->decoding_readline to get a new line
369 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
370 stored the result in tok->decoding_buffer
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000371 3) PyBytesObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372 (in the s buffer) to copy entire contents of the line read
373 by tok->decoding_readline. tok->decoding_buffer has the overflow.
374 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 reached): see tok_nextc and its calls to decoding_fgets.
377*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000378
379static char *
380fp_readl(char *s, int size, struct tok_state *tok)
381{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000383 const char *buf;
384 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000385
386 /* Ask for one less byte so we can terminate it */
387 assert(size > 0);
388 size--;
389
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000390 if (tok->decoding_buffer) {
391 bufobj = tok->decoding_buffer;
392 Py_INCREF(bufobj);
393 }
394 else
395 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000396 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
397 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000400 if (PyUnicode_CheckExact(bufobj))
401 {
402 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
403 if (buf == NULL) {
404 goto error;
405 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000406 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000407 else
408 {
409 buf = PyBytes_AsString(bufobj);
410 if (buf == NULL) {
411 goto error;
412 }
413 buflen = PyBytes_GET_SIZE(bufobj);
414 }
415
416 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000417 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 /* Too many chars, the rest goes into tok->decoding_buffer */
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000419 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
420 buflen-size);
421 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000423 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000424 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 else
426 tok->decoding_buffer = NULL;
427
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 memcpy(s, buf, buflen);
429 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000430 if (buflen == 0) /* EOF */
431 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000432 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000434
435error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000437 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000438}
439
440/* Set the readline function for TOK to a StreamReader's
441 readline function. The StreamReader is named ENC.
442
443 This function is called from check_bom and check_coding_spec.
444
445 ENC is usually identical to the future value of tok->encoding,
446 except for the (currently unsupported) case of UTF-16.
447
448 Return 1 on success, 0 on failure. */
449
450static int
451fp_setreadl(struct tok_state *tok, const char* enc)
452{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000453 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
Christian Heimes819b8bf2008-01-03 23:05:47 +0000455 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456 if (io == NULL)
457 goto cleanup;
458
459 stream = PyObject_CallMethod(io, "open", "ssis",
460 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000461 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000464 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000465 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467
468 cleanup:
469 Py_XDECREF(stream);
470 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000471 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472}
473
474/* Fetch the next byte from TOK. */
475
476static int fp_getc(struct tok_state *tok) {
477 return getc(tok->fp);
478}
479
480/* Unfetch the last byte back into TOK. */
481
482static void fp_ungetc(int c, struct tok_state *tok) {
483 ungetc(c, tok->fp);
484}
485
/* Check whether the bytes at S start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the lead byte and the presence of
   the required continuation bytes (0x80-0xBF) are checked.  Lead bytes
   0xC0, 0xC1 and 0xF5 and above are rejected because no valid UTF-8
   sequence starts with them.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminating NUL is rejected at
   the NUL and nothing behind it is read. */
static int valid_utf8(const unsigned char* s)
{
	int expected;
	int i;
	if (*s < 0x80)
		/* single-byte code */
		return 1;
	if (*s < 0xC2)
		/* 0x80-0xBF: following byte; 0xC0-0xC1: overlong form */
		return 0;
	if (*s < 0xE0)
		expected = 1;
	else if (*s < 0xF0)
		expected = 2;
	else if (*s < 0xF5)
		expected = 3;
	else
		/* would encode a value above U+10FFFF */
		return 0;
	for (i = 1; i <= expected; i++)
		if (s[i] < 0x80 || s[i] >= 0xC0)
			return 0;
	return expected + 1;
}
513
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514/* Read a line of input from TOK. Determine encoding
515 if necessary. */
516
517static char *
518decoding_fgets(char *s, int size, struct tok_state *tok)
519{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000520 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000521 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000522 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000523 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 /* We already have a codec associated with
525 this input. */
526 line = fp_readl(s, size, tok);
527 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000528 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000530 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 break;
533 } else {
534 /* We have not yet determined the encoding.
535 If an encoding is found, use the file-pointer
536 reader functions from now on. */
537 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
538 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000539 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000541 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
543 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
544 return error_ret(tok);
545 }
546 }
547#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000548 /* The default encoding is UTF-8, so make sure we don't have any
549 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000550 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000552 int length;
553 for (c = (unsigned char *)line; *c; c += length)
554 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 badchar = *c;
556 break;
557 }
558 }
559 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000560 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000561 /* Need to add 1 to the line number, since this line
562 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000563 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000564 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000565 "in file %.200s on line %i, "
566 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000567 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000568 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000569 PyErr_SetString(PyExc_SyntaxError, buf);
570 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 }
572#endif
573 return line;
574}
575
576static int
577decoding_feof(struct tok_state *tok)
578{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000579 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 return feof(tok->fp);
581 } else {
582 PyObject* buf = tok->decoding_buffer;
583 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000584 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 if (buf == NULL) {
586 error_ret(tok);
587 return 1;
588 } else {
589 tok->decoding_buffer = buf;
590 }
591 }
592 return PyObject_Length(buf) == 0;
593 }
594}
595
596/* Fetch a byte from TOK, using the string buffer. */
597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000598static int
599buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000600 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Unfetch a byte from TOK, using the string buffer. */
604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000605static void
606buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000608 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609}
610
611/* Set the readline function for TOK to ENC. For the string-based
612 tokenizer, this means to just record the encoding. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000616 tok->enc = enc;
617 return 1;
618}
619
620/* Return a UTF-8 encoding Python string object from the
621 C byte string STR, which is encoded with ENC. */
622
623static PyObject *
624translate_into_utf8(const char* str, const char* enc) {
625 PyObject *utf8;
626 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
627 if (buf == NULL)
628 return NULL;
629 utf8 = PyUnicode_AsUTF8String(buf);
630 Py_DECREF(buf);
631 return utf8;
632}
633
634/* Decode a byte string STR for use as the buffer of TOK.
635 Look for encoding declarations inside STR, and record them
636 inside TOK. */
637
638static const char *
639decode_str(const char *str, struct tok_state *tok)
640{
641 PyObject* utf8 = NULL;
642 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000643 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 int lineno = 0;
645 tok->enc = NULL;
646 tok->str = str;
647 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000648 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000650 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651 if (tok->enc != NULL) {
652 utf8 = translate_into_utf8(str, tok->enc);
653 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000654 return error_ret(tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000655 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656 }
657 for (s = str;; s++) {
658 if (*s == '\0') break;
659 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000660 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000661 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662 lineno++;
663 if (lineno == 2) break;
664 }
665 }
666 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000667 /* need to check line 1 and 2 separately since check_coding_spec
668 assumes a single line as input */
669 if (newl[0]) {
670 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
671 return error_ret(tok);
672 if (tok->enc == NULL && newl[1]) {
673 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
674 tok, buf_setreadl))
675 return error_ret(tok);
676 }
677 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000678 if (tok->enc != NULL) {
679 assert(utf8 == NULL);
680 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000681 if (utf8 == NULL) {
682 PyErr_Format(PyExc_SyntaxError,
683 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000684 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000685 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000686 str = PyString_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000687 }
688 assert(tok->decoding_buffer == NULL);
689 tok->decoding_buffer = utf8; /* CAUTION */
690 return str;
691}
692
693#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000694
695/* Set up tokenizer for string */
696
697struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000698PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000699{
700 struct tok_state *tok = tok_new();
701 if (tok == NULL)
702 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000704 if (str == NULL) {
705 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000706 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000707 }
708
Martin v. Löwis95292d62002-12-11 14:04:59 +0000709 /* XXX: constify members. */
710 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711 return tok;
712}
713
714
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000715/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000716
717struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000718PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000719{
720 struct tok_state *tok = tok_new();
721 if (tok == NULL)
722 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000723 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000724 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000725 return NULL;
726 }
727 tok->cur = tok->inp = tok->buf;
728 tok->end = tok->buf + BUFSIZ;
729 tok->fp = fp;
730 tok->prompt = ps1;
731 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000732 if (enc != NULL) {
733 /* Must copy encoding declaration since it
734 gets copied into the parse tree. */
735 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
736 if (!tok->encoding) {
737 PyTokenizer_Free(tok);
738 return NULL;
739 }
740 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000741 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000742 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743 return tok;
744}
745
746
747/* Free a tok_state structure */
748
749void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000750PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000752 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000753 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000754#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000755 Py_XDECREF(tok->decoding_readline);
756 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000757#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000759 PyMem_FREE(tok->buf);
760 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761}
762
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763/* Get next char, updating state; error code goes into tok->done */
764
765static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000766tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000768 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000769 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000770 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000771 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000772 if (tok->done != E_OK)
773 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000775 char *end = strchr(tok->inp, '\n');
776 if (end != NULL)
777 end++;
778 else {
779 end = strchr(tok->inp, '\0');
780 if (end == tok->inp) {
781 tok->done = E_EOF;
782 return EOF;
783 }
784 }
785 if (tok->start == NULL)
786 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000787 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000788 tok->lineno++;
789 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000790 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000792 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000793 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000794#ifndef PGEN
795 if (tok->encoding && newtok && *newtok) {
796 /* Recode to UTF-8 */
797 Py_ssize_t buflen;
798 const char* buf;
799 PyObject *u = translate_into_utf8(newtok, tok->encoding);
800 PyMem_FREE(newtok);
801 if (!u) {
802 tok->done = E_DECODE;
803 return EOF;
804 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000805 buflen = PyString_GET_SIZE(u);
806 buf = PyString_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000807 if (!buf) {
808 Py_DECREF(u);
809 tok->done = E_DECODE;
810 return EOF;
811 }
812 newtok = PyMem_MALLOC(buflen+1);
813 strcpy(newtok, buf);
814 Py_DECREF(u);
815 }
816#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817 if (tok->nextprompt != NULL)
818 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000819 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000820 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000821 else if (*newtok == '\0') {
822 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823 tok->done = E_EOF;
824 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000825 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000826 size_t start = tok->start - tok->buf;
827 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000828 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000829 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000830 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 tok->lineno++;
832 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000833 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000834 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000835 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000836 tok->done = E_NOMEM;
837 return EOF;
838 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 tok->buf = buf;
840 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000841 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000842 strcpy(tok->buf + oldlen, newtok);
843 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 tok->inp = tok->buf + newlen;
845 tok->end = tok->inp + 1;
846 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000847 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 else {
849 tok->lineno++;
850 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000851 PyMem_FREE(tok->buf);
852 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000853 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000855 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 tok->inp = strchr(tok->buf, '\0');
857 tok->end = tok->inp + 1;
858 }
859 }
860 else {
861 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000863 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 if (tok->start == NULL) {
865 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000866 tok->buf = (char *)
867 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000868 if (tok->buf == NULL) {
869 tok->done = E_NOMEM;
870 return EOF;
871 }
872 tok->end = tok->buf + BUFSIZ;
873 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000874 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
875 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000876 tok->done = E_EOF;
877 done = 1;
878 }
879 else {
880 tok->done = E_OK;
881 tok->inp = strchr(tok->buf, '\0');
882 done = tok->inp[-1] == '\n';
883 }
884 }
885 else {
886 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000887 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000888 tok->done = E_EOF;
889 done = 1;
890 }
891 else
892 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000893 }
894 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000896 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897 Py_ssize_t curstart = tok->start == NULL ? -1 :
898 tok->start - tok->buf;
899 Py_ssize_t curvalid = tok->inp - tok->buf;
900 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000901 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000902 newbuf = (char *)PyMem_REALLOC(newbuf,
903 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000904 if (newbuf == NULL) {
905 tok->done = E_NOMEM;
906 tok->cur = tok->inp;
907 return EOF;
908 }
909 tok->buf = newbuf;
910 tok->inp = tok->buf + curvalid;
911 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000912 tok->start = curstart < 0 ? NULL :
913 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000914 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000915 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000916 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000917 /* Break out early on decoding
918 errors, as tok->buf will be NULL
919 */
920 if (tok->decoding_erred)
921 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000922 /* Last line does not end in \n,
923 fake one */
924 strcpy(tok->inp, "\n");
925 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000926 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000927 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000928 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000929 if (tok->buf != NULL) {
930 tok->cur = tok->buf + cur;
931 tok->line_start = tok->cur;
932 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000933 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000934 pt = tok->inp - 2;
935 if (pt >= tok->buf && *pt == '\r') {
936 *pt++ = '\n';
937 *pt = '\0';
938 tok->inp = pt;
939 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000940 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000941 }
942 if (tok->done != E_OK) {
943 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000944 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000945 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946 return EOF;
947 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000948 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000949 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000950}
951
952
953/* Back-up one character */
954
955static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000956tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957{
958 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000959 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000960 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000961 if (*tok->cur != c)
962 *tok->cur = c;
963 }
964}
965
966
967/* Return the token corresponding to a single character */
968
969int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000970PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000971{
972 switch (c) {
973 case '(': return LPAR;
974 case ')': return RPAR;
975 case '[': return LSQB;
976 case ']': return RSQB;
977 case ':': return COLON;
978 case ',': return COMMA;
979 case ';': return SEMI;
980 case '+': return PLUS;
981 case '-': return MINUS;
982 case '*': return STAR;
983 case '/': return SLASH;
984 case '|': return VBAR;
985 case '&': return AMPER;
986 case '<': return LESS;
987 case '>': return GREATER;
988 case '=': return EQUAL;
989 case '.': return DOT;
990 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000991 case '{': return LBRACE;
992 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000993 case '^': return CIRCUMFLEX;
994 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000995 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000996 default: return OP;
997 }
998}
999
1000
Guido van Rossumfbab9051991-10-20 20:25:03 +00001001int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001002PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001003{
1004 switch (c1) {
1005 case '=':
1006 switch (c2) {
1007 case '=': return EQEQUAL;
1008 }
1009 break;
1010 case '!':
1011 switch (c2) {
1012 case '=': return NOTEQUAL;
1013 }
1014 break;
1015 case '<':
1016 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +00001017 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001018 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001019 }
1020 break;
1021 case '>':
1022 switch (c2) {
1023 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001024 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001025 }
1026 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001027 case '+':
1028 switch (c2) {
1029 case '=': return PLUSEQUAL;
1030 }
1031 break;
1032 case '-':
1033 switch (c2) {
1034 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001035 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001036 }
1037 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001038 case '*':
1039 switch (c2) {
1040 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001041 case '=': return STAREQUAL;
1042 }
1043 break;
1044 case '/':
1045 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001046 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001047 case '=': return SLASHEQUAL;
1048 }
1049 break;
1050 case '|':
1051 switch (c2) {
1052 case '=': return VBAREQUAL;
1053 }
1054 break;
1055 case '%':
1056 switch (c2) {
1057 case '=': return PERCENTEQUAL;
1058 }
1059 break;
1060 case '&':
1061 switch (c2) {
1062 case '=': return AMPEREQUAL;
1063 }
1064 break;
1065 case '^':
1066 switch (c2) {
1067 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001068 }
1069 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001070 }
1071 return OP;
1072}
1073
Thomas Wouters434d0822000-08-24 20:11:32 +00001074int
1075PyToken_ThreeChars(int c1, int c2, int c3)
1076{
1077 switch (c1) {
1078 case '<':
1079 switch (c2) {
1080 case '<':
1081 switch (c3) {
1082 case '=':
1083 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001084 }
1085 break;
1086 }
1087 break;
1088 case '>':
1089 switch (c2) {
1090 case '>':
1091 switch (c3) {
1092 case '=':
1093 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001094 }
1095 break;
1096 }
1097 break;
1098 case '*':
1099 switch (c2) {
1100 case '*':
1101 switch (c3) {
1102 case '=':
1103 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001104 }
1105 break;
1106 }
1107 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001108 case '/':
1109 switch (c2) {
1110 case '/':
1111 switch (c3) {
1112 case '=':
1113 return DOUBLESLASHEQUAL;
1114 }
1115 break;
1116 }
1117 break;
Georg Brandldde00282007-03-18 19:01:53 +00001118 case '.':
1119 switch (c2) {
1120 case '.':
1121 switch (c3) {
1122 case '.':
1123 return ELLIPSIS;
1124 }
1125 break;
1126 }
1127 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001128 }
1129 return OP;
1130}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001131
Guido van Rossum926f13a1998-04-09 21:38:06 +00001132static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001133indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001134{
1135 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001136 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001137 tok->cur = tok->inp;
1138 return 1;
1139 }
1140 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001141 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1142 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001143 tok->altwarning = 0;
1144 }
1145 return 0;
1146}
1147
Martin v. Löwis47383402007-08-15 07:32:56 +00001148#ifdef PGEN
1149#define verify_identifier(s,e) 1
1150#else
1151/* Verify that the identifier follows PEP 3131. */
1152static int
1153verify_identifier(char *start, char *end)
1154{
Guido van Rossume3e37012007-08-29 18:54:41 +00001155 PyObject *s;
1156 int result;
1157 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1158 if (s == NULL) {
1159 PyErr_Clear();
1160 return 0;
1161 }
1162 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001163 Py_DECREF(s);
1164 return result;
1165}
1166#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001167
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001168/* Get next token, after space stripping etc. */
1169
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001170static int
1171tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001172{
1173 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001174 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001175
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001176 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001178 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 blankline = 0;
1180
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001181 /* Get indentation level */
1182 if (tok->atbol) {
1183 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001184 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001186 for (;;) {
1187 c = tok_nextc(tok);
1188 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001189 col++, altcol++;
1190 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001192 altcol = (altcol/tok->alttabsize + 1)
1193 * tok->alttabsize;
1194 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001195 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001196 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001197 else
1198 break;
1199 }
1200 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001201 if (c == '#' || c == '\n') {
1202 /* Lines with only whitespace and/or comments
1203 shouldn't affect the indentation and are
1204 not passed to the parser as NEWLINE tokens,
1205 except *totally* empty lines in interactive
1206 mode, which signal the end of a command group. */
1207 if (col == 0 && c == '\n' && tok->prompt != NULL)
1208 blankline = 0; /* Let it through */
1209 else
1210 blankline = 1; /* Ignore completely */
1211 /* We can't jump back right here since we still
1212 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001214 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001215 if (col == tok->indstack[tok->indent]) {
1216 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001217 if (altcol != tok->altindstack[tok->indent]) {
1218 if (indenterror(tok))
1219 return ERRORTOKEN;
1220 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001222 else if (col > tok->indstack[tok->indent]) {
1223 /* Indent -- always one */
1224 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001225 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001226 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001227 return ERRORTOKEN;
1228 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001229 if (altcol <= tok->altindstack[tok->indent]) {
1230 if (indenterror(tok))
1231 return ERRORTOKEN;
1232 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001233 tok->pendin++;
1234 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001235 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001236 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001237 else /* col < tok->indstack[tok->indent] */ {
1238 /* Dedent -- any number, must be consistent */
1239 while (tok->indent > 0 &&
1240 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001241 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001242 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001243 }
1244 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001245 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001246 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001247 return ERRORTOKEN;
1248 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001249 if (altcol != tok->altindstack[tok->indent]) {
1250 if (indenterror(tok))
1251 return ERRORTOKEN;
1252 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 }
1254 }
1255 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001256
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001257 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001258
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259 /* Return pending indents/dedents */
1260 if (tok->pendin != 0) {
1261 if (tok->pendin < 0) {
1262 tok->pendin++;
1263 return DEDENT;
1264 }
1265 else {
1266 tok->pendin--;
1267 return INDENT;
1268 }
1269 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001270
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001272 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 /* Skip spaces */
1274 do {
1275 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001276 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001277
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001279 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001280
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001281 /* Skip comment */
1282 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001283 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001285
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001287 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001289 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001290
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001292 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001293 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001294 /* Process b"", r"" and br"" */
1295 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001296 c = tok_nextc(tok);
1297 if (c == '"' || c == '\'')
1298 goto letter_quote;
1299 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001300 if (c == 'r' || c == 'R') {
1301 c = tok_nextc(tok);
1302 if (c == '"' || c == '\'')
1303 goto letter_quote;
1304 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001305 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001306 if (c >= 128)
1307 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001308 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001309 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001311 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001312 !verify_identifier(tok->start, tok->cur)) {
1313 tok->done = E_IDENTIFIER;
1314 return ERRORTOKEN;
1315 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001316 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 *p_end = tok->cur;
1318 return NAME;
1319 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001320
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 /* Newline */
1322 if (c == '\n') {
1323 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001324 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001325 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001326 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001328 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 return NEWLINE;
1330 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001331
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001332 /* Period or number starting with period? */
1333 if (c == '.') {
1334 c = tok_nextc(tok);
1335 if (isdigit(c)) {
1336 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001337 } else if (c == '.') {
1338 c = tok_nextc(tok);
1339 if (c == '.') {
1340 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001341 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001342 return ELLIPSIS;
1343 } else {
1344 tok_backup(tok, c);
1345 }
1346 tok_backup(tok, '.');
1347 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001348 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001349 }
Georg Brandldde00282007-03-18 19:01:53 +00001350 *p_start = tok->start;
1351 *p_end = tok->cur;
1352 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001353 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001354
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355 /* Number */
1356 if (isdigit(c)) {
1357 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001358 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001359 c = tok_nextc(tok);
1360 if (c == '.')
1361 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001362#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001363 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001364 goto imaginary;
1365#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001367
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001369 c = tok_nextc(tok);
1370 if (!isxdigit(c)) {
1371 tok->done = E_TOKEN;
1372 tok_backup(tok, c);
1373 return ERRORTOKEN;
1374 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 do {
1376 c = tok_nextc(tok);
1377 } while (isxdigit(c));
1378 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001379 else if (c == 'o' || c == 'O') {
1380 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001381 c = tok_nextc(tok);
1382 if (c < '0' || c > '8') {
1383 tok->done = E_TOKEN;
1384 tok_backup(tok, c);
1385 return ERRORTOKEN;
1386 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001387 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001389 } while ('0' <= c && c < '8');
1390 }
1391 else if (c == 'b' || c == 'B') {
1392 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001393 c = tok_nextc(tok);
1394 if (c != '0' && c != '1') {
1395 tok->done = E_TOKEN;
1396 tok_backup(tok, c);
1397 return ERRORTOKEN;
1398 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001399 do {
1400 c = tok_nextc(tok);
1401 } while (c == '0' || c == '1');
1402 }
1403 else {
1404 int nonzero = 0;
1405 /* maybe old-style octal; c is first char of it */
1406 /* in any case, allow '0' as a literal */
1407 while (c == '0')
1408 c = tok_nextc(tok);
1409 while (isdigit(c)) {
1410 nonzero = 1;
1411 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001412 }
1413 if (c == '.')
1414 goto fraction;
1415 else if (c == 'e' || c == 'E')
1416 goto exponent;
1417#ifndef WITHOUT_COMPLEX
1418 else if (c == 'j' || c == 'J')
1419 goto imaginary;
1420#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001421 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001422 tok->done = E_TOKEN;
1423 tok_backup(tok, c);
1424 return ERRORTOKEN;
1425 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001426 }
1427 }
1428 else {
1429 /* Decimal */
1430 do {
1431 c = tok_nextc(tok);
1432 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001433 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001434 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001435 if (c == '.') {
1436 fraction:
1437 /* Fraction */
1438 do {
1439 c = tok_nextc(tok);
1440 } while (isdigit(c));
1441 }
1442 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001443 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001444 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001445 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001446 if (c == '+' || c == '-')
1447 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001448 if (!isdigit(c)) {
1449 tok->done = E_TOKEN;
1450 tok_backup(tok, c);
1451 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001452 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001453 do {
1454 c = tok_nextc(tok);
1455 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001456 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001457#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001458 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001459 /* Imaginary part */
1460 imaginary:
1461 c = tok_nextc(tok);
1462#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 }
1464 }
1465 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001466 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467 *p_end = tok->cur;
1468 return NUMBER;
1469 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001470
1471 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001472 /* String */
1473 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001474 int quote = c;
1475 int quote_size = 1; /* 1 or 3 */
1476 int end_quote_size = 0;
1477
1478 /* Find the quote size and start of string */
1479 c = tok_nextc(tok);
1480 if (c == quote) {
1481 c = tok_nextc(tok);
1482 if (c == quote)
1483 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001484 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001485 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001486 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001487 if (c != quote)
1488 tok_backup(tok, c);
1489
1490 /* Get rest of string */
1491 while (end_quote_size != quote_size) {
1492 c = tok_nextc(tok);
1493 if (c == EOF) {
1494 if (quote_size == 3)
1495 tok->done = E_EOFS;
1496 else
1497 tok->done = E_EOLS;
1498 tok->cur = tok->inp;
1499 return ERRORTOKEN;
1500 }
1501 if (quote_size == 1 && c == '\n') {
1502 tok->done = E_EOLS;
1503 tok->cur = tok->inp;
1504 return ERRORTOKEN;
1505 }
1506 if (c == quote)
1507 end_quote_size += 1;
1508 else {
1509 end_quote_size = 0;
1510 if (c == '\\')
1511 c = tok_nextc(tok); /* skip escaped char */
1512 }
1513 }
1514
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001515 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001516 *p_end = tok->cur;
1517 return STRING;
1518 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001519
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001520 /* Line continuation */
1521 if (c == '\\') {
1522 c = tok_nextc(tok);
1523 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001524 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001525 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001526 return ERRORTOKEN;
1527 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001528 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001529 goto again; /* Read next line */
1530 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001531
Guido van Rossumfbab9051991-10-20 20:25:03 +00001532 /* Check for two-character token */
1533 {
1534 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001535 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001536 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001537 int c3 = tok_nextc(tok);
1538 int token3 = PyToken_ThreeChars(c, c2, c3);
1539 if (token3 != OP) {
1540 token = token3;
1541 } else {
1542 tok_backup(tok, c3);
1543 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001544 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001545 *p_end = tok->cur;
1546 return token;
1547 }
1548 tok_backup(tok, c2);
1549 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001550
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001551 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001552 switch (c) {
1553 case '(':
1554 case '[':
1555 case '{':
1556 tok->level++;
1557 break;
1558 case ')':
1559 case ']':
1560 case '}':
1561 tok->level--;
1562 break;
1563 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001564
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001566 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001567 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001568 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001569}
1570
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001571int
1572PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1573{
1574 int result = tok_get(tok, p_start, p_end);
1575 if (tok->decoding_erred) {
1576 result = ERRORTOKEN;
1577 tok->done = E_DECODE;
1578 }
1579 return result;
1580}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001581
Thomas Wouters89d996e2007-09-08 17:39:28 +00001582/* This function is only called from parsetok. However, it cannot live
1583 there, as it must be empty for PGEN, and we can check for PGEN only
1584 in this file. */
1585
1586#ifdef PGEN
/* PGEN build: nothing to restore; always returns NULL and leaves
   *offset untouched. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1592#else
1593static PyObject *
1594dec_utf8(const char *enc, const char *text, size_t len) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001595 PyObject *ret = NULL;
Thomas Wouters89d996e2007-09-08 17:39:28 +00001596 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1597 if (unicode_text) {
1598 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1599 Py_DECREF(unicode_text);
1600 }
1601 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001602 PyErr_Clear();
1603 }
1604 else {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001605 assert(PyString_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001606 }
1607 return ret;
1608}
1609
1610char *
1611PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1612{
1613 char *text = NULL;
1614 if (tok->encoding) {
1615 /* convert source to original encondig */
1616 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1617 if (lineobj != NULL) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001618 int linelen = PyString_GET_SIZE(lineobj);
1619 const char *line = PyString_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001620 text = PyObject_MALLOC(linelen + 1);
1621 if (text != NULL && line != NULL) {
1622 if (linelen)
1623 strncpy(text, line, linelen);
1624 text[linelen] = '\0';
1625 }
1626 Py_DECREF(lineobj);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001627
Thomas Wouters89d996e2007-09-08 17:39:28 +00001628 /* adjust error offset */
1629 if (*offset > 1) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001630 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001631 tok->buf,
1632 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001633 if (offsetobj) {
Christian Heimes90aa7642007-12-19 02:45:37 +00001634 *offset = 1 + Py_SIZE(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001635 Py_DECREF(offsetobj);
1636 }
1637 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001638
Thomas Wouters89d996e2007-09-08 17:39:28 +00001639 }
1640 }
1641 return text;
1642
1643}
1644#endif
1645
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001646/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001647
1648 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001649 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001650 should be assumed to be PyUnicode_GetDefaultEncoding()).
1651
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001652 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1653 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001654*/
1655char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001656PyTokenizer_FindEncoding(int fd)
1657{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001658 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001659 FILE *fp;
1660 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001661
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001662 fd = dup(fd);
1663 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001664 return NULL;
1665 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001666 fp = fdopen(fd, "r");
1667 if (fp == NULL) {
1668 return NULL;
1669 }
1670 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1671 if (tok == NULL) {
1672 fclose(fp);
1673 return NULL;
1674 }
1675 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001676 PyTokenizer_Get(tok, &p_start, &p_end);
1677 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001678 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001679 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001680 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001681 strcpy(encoding, tok->encoding);
1682 }
1683 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001684 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001685}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001686
Guido van Rossum408027e1996-12-30 16:17:54 +00001687#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001688
1689void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001690tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001691{
Guido van Rossum86bea461997-04-29 21:03:06 +00001692 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001693 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1694 printf("(%.*s)", (int)(end - start), start);
1695}
1696
1697#endif