blob: 991d7609f82ee98084006f50dcc21e23bea4325c [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C can begin an identifier: an ASCII letter, '_', or any
   value >= 128 (i.e. a non-ASCII byte).
   Every use of the argument is parenthesized so that an expression
   argument such as `x & 0xff` expands with the intended precedence.
   The argument is still evaluated more than once, so it must not have
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C can appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128 (i.e. a non-ASCII byte).
   Every use of the argument is parenthesized so that an expression
   argument such as `x & 0xff` expands with the intended precedence.
   The argument is still evaluated more than once, so it must not have
   side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative int.
   When plain char is already unsigned the value is passed through
   untouched; otherwise the low eight bits are kept. */
/* XXX Relies on characters being 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)          ((c) & 0xff)
#else
#define Py_CHARMASK(c)          (c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
Brett Cannond5ec98c2007-10-20 02:54:14 +000055
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER", "EQUAL",
    /* 23 */ "DOT", "PERCENT", "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
    /* 43 */ "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL",
    /* 46 */ "DOUBLESTAREQUAL", "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP",
    /* 53 */ "<ERRORTOKEN>",
    /* 54 */ "<N_TOKENS>"
};
116
117
118/* Create and initialize a new tok_state structure */
119
120static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000123 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
124 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 if (tok == NULL)
126 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000127 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 tok->done = E_OK;
129 tok->fp = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000137 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000143 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000149 tok->decoding_readline = NULL;
150 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000151#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000152 return tok;
153}
154
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000155#ifdef PGEN
156
157static char *
158decoding_fgets(char *s, int size, struct tok_state *tok)
159{
160 return fgets(s, size, tok->fp);
161}
162
163static int
164decoding_feof(struct tok_state *tok)
165{
166 return feof(tok->fp);
167}
168
/* PGEN build: strings are used as given.  STR is returned unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;          /* unused in this build */
    return str;
}
174
175#else /* PGEN */
176
177static char *
178error_ret(struct tok_state *tok) /* XXX */
179{
180 tok->decoding_erred = 1;
181 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 tok->buf = NULL;
184 return NULL; /* as if it were EOF */
185}
186
187static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000190 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000191 if (result != NULL) {
192 memcpy(result, s, len);
193 result[len] = '\0';
194 }
195 return result;
196}
197
/* Map the usual spellings of the UTF-8 and Latin-1 codec names onto one
   canonical name.

   Only the first 12 characters of S are looked at; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is one of the recognized aliases (optionally
   followed by "-<suffix>"), and S itself otherwise.  The literals must
   not be modified or freed by the caller. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char (or EOF); a negative plain char is undefined
           behavior, so convert before calling tolower(). */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
220
221/* Return the coding spec in S, or NULL if none is found. */
222
223static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000224get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000225{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000226 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000227 /* Coding spec must be in a comment, and that comment must be
228 * the only statement on the source code line. */
229 for (i = 0; i < size - 6; i++) {
230 if (s[i] == '#')
231 break;
232 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
233 return NULL;
234 }
235 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236 const char* t = s + i;
237 if (strncmp(t, "coding", 6) == 0) {
238 const char* begin = NULL;
239 t += 6;
240 if (t[0] != ':' && t[0] != '=')
241 continue;
242 do {
243 t++;
244 } while (t[0] == '\x20' || t[0] == '\t');
245
246 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000247 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000248 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 t++;
250
251 if (begin < t) {
252 char* r = new_string(begin, t - begin);
253 char* q = get_normal_name(r);
254 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000256 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 }
258 return r;
259 }
260 }
261 }
262 return NULL;
263}
264
265/* Check whether the line contains a coding spec. If it does,
266 invoke the set_readline function for the new encoding.
267 This function receives the tok_state and the new encoding.
268 Return 1 on success, 0 on failure. */
269
270static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000271check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int set_readline(struct tok_state *, const char *))
273{
Tim Peters17db21f2002-09-03 15:39:58 +0000274 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000276
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000277 if (tok->cont_line)
278 /* It's a continuation line, so it can't be a coding spec. */
279 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000280 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 if (cs != NULL) {
282 tok->read_coding_spec = 1;
283 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 if (strcmp(cs, "utf-8") == 0 ||
286 strcmp(cs, "iso-8859-1") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000326 ch = get_char(tok);
327 if (ch != 0xBB) {
328 unget_char(ch, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
333 ch = get_char(tok);
334 if (ch != 0xBF) {
335 unget_char(ch, tok);
336 unget_char(0xBB, tok);
337 unget_char(0xEF, tok);
338 /* any token beginning with '\xEF' is a bad token */
339 return 1;
340 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341#if 0
342 /* Disable support for UTF-16 BOMs until a decision
343 is made whether this needs to be supported. */
344 } else if (ch == 0xFE) {
345 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
346 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000347 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000348 } else if (ch == 0xFF) {
349 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
350 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000351 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000352#endif
353 } else {
354 unget_char(ch, tok);
355 return 1;
356 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000357 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000359 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000360 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361 return 1;
362}
363
364/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000365 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000367 On entry, tok->decoding_buffer will be one of:
368 1) NULL: need to call tok->decoding_readline to get a new line
369 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
370 stored the result in tok->decoding_buffer
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000371 3) PyBytesObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372 (in the s buffer) to copy entire contents of the line read
373 by tok->decoding_readline. tok->decoding_buffer has the overflow.
374 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 reached): see tok_nextc and its calls to decoding_fgets.
377*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000378
379static char *
380fp_readl(char *s, int size, struct tok_state *tok)
381{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000383 const char *buf;
384 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000385
386 /* Ask for one less byte so we can terminate it */
387 assert(size > 0);
388 size--;
389
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000390 if (tok->decoding_buffer) {
391 bufobj = tok->decoding_buffer;
392 Py_INCREF(bufobj);
393 }
394 else
395 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000396 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
397 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000400 if (PyUnicode_CheckExact(bufobj))
401 {
402 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
403 if (buf == NULL) {
404 goto error;
405 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000406 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000407 else
408 {
409 buf = PyBytes_AsString(bufobj);
410 if (buf == NULL) {
411 goto error;
412 }
413 buflen = PyBytes_GET_SIZE(bufobj);
414 }
415
416 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000417 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 /* Too many chars, the rest goes into tok->decoding_buffer */
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000419 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
420 buflen-size);
421 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000423 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000424 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 else
426 tok->decoding_buffer = NULL;
427
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 memcpy(s, buf, buflen);
429 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000430 if (buflen == 0) /* EOF */
431 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000432 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000434
435error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000437 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000438}
439
440/* Set the readline function for TOK to a StreamReader's
441 readline function. The StreamReader is named ENC.
442
443 This function is called from check_bom and check_coding_spec.
444
445 ENC is usually identical to the future value of tok->encoding,
446 except for the (currently unsupported) case of UTF-16.
447
448 Return 1 on success, 0 on failure. */
449
450static int
451fp_setreadl(struct tok_state *tok, const char* enc)
452{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000453 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454
Christian Heimes819b8bf2008-01-03 23:05:47 +0000455 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000456 if (io == NULL)
457 goto cleanup;
458
459 stream = PyObject_CallMethod(io, "open", "ssis",
460 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000461 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000464 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000465 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467
468 cleanup:
469 Py_XDECREF(stream);
470 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000471 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000472}
473
474/* Fetch the next byte from TOK. */
475
476static int fp_getc(struct tok_state *tok) {
477 return getc(tok->fp);
478}
479
480/* Unfetch the last byte back into TOK. */
481
482static void fp_ungetc(int c, struct tok_state *tok) {
483 ungetc(c, tok->fp);
484}
485
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   S must be NUL-terminated.  Only the lead-byte class and the
   10xxxxxx form of the continuation bytes are checked; overlong forms
   and out-of-range code points are not rejected here.

   Continuation bytes are examined front to back, so a sequence cut
   short by the terminating NUL stops the scan at the NUL instead of
   reading bytes beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* NUL (0x00) fails this test, which ends the scan in time. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
513
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514/* Read a line of input from TOK. Determine encoding
515 if necessary. */
516
517static char *
518decoding_fgets(char *s, int size, struct tok_state *tok)
519{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000520 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000521 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000522 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000523 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 /* We already have a codec associated with
525 this input. */
526 line = fp_readl(s, size, tok);
527 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000528 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000530 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 break;
533 } else {
534 /* We have not yet determined the encoding.
535 If an encoding is found, use the file-pointer
536 reader functions from now on. */
537 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
538 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000539 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000541 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
543 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
544 return error_ret(tok);
545 }
546 }
547#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000548 /* The default encoding is UTF-8, so make sure we don't have any
549 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000550 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000552 int length;
553 for (c = (unsigned char *)line; *c; c += length)
554 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 badchar = *c;
556 break;
557 }
558 }
559 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000560 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000561 /* Need to add 1 to the line number, since this line
562 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000563 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000564 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000565 "in file %.200s on line %i, "
566 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000567 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000568 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000569 PyErr_SetString(PyExc_SyntaxError, buf);
570 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 }
572#endif
573 return line;
574}
575
576static int
577decoding_feof(struct tok_state *tok)
578{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000579 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 return feof(tok->fp);
581 } else {
582 PyObject* buf = tok->decoding_buffer;
583 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000584 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000585 if (buf == NULL) {
586 error_ret(tok);
587 return 1;
588 } else {
589 tok->decoding_buffer = buf;
590 }
591 }
592 return PyObject_Length(buf) == 0;
593 }
594}
595
596/* Fetch a byte from TOK, using the string buffer. */
597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000598static int
599buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000600 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Unfetch a byte from TOK, using the string buffer. */
604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000605static void
606buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000608 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609}
610
611/* Set the readline function for TOK to ENC. For the string-based
612 tokenizer, this means to just record the encoding. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000616 tok->enc = enc;
617 return 1;
618}
619
620/* Return a UTF-8 encoding Python string object from the
621 C byte string STR, which is encoded with ENC. */
622
623static PyObject *
624translate_into_utf8(const char* str, const char* enc) {
625 PyObject *utf8;
626 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
627 if (buf == NULL)
628 return NULL;
629 utf8 = PyUnicode_AsUTF8String(buf);
630 Py_DECREF(buf);
631 return utf8;
632}
633
634/* Decode a byte string STR for use as the buffer of TOK.
635 Look for encoding declarations inside STR, and record them
636 inside TOK. */
637
638static const char *
639decode_str(const char *str, struct tok_state *tok)
640{
641 PyObject* utf8 = NULL;
642 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000643 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 int lineno = 0;
645 tok->enc = NULL;
646 tok->str = str;
647 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000648 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000650 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651 if (tok->enc != NULL) {
652 utf8 = translate_into_utf8(str, tok->enc);
653 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000654 return error_ret(tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000655 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656 }
657 for (s = str;; s++) {
658 if (*s == '\0') break;
659 else if (*s == '\n') {
Georg Brandl86def6c2008-01-21 20:36:10 +0000660 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 lineno++;
662 if (lineno == 2) break;
663 }
664 }
665 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000666 /* need to check line 1 and 2 separately since check_coding_spec
667 assumes a single line as input */
668 if (newl[0]) {
669 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
670 return error_ret(tok);
671 if (tok->enc == NULL && newl[1]) {
672 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
673 tok, buf_setreadl))
674 return error_ret(tok);
675 }
676 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000677 if (tok->enc != NULL) {
678 assert(utf8 == NULL);
679 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000680 if (utf8 == NULL) {
681 PyErr_Format(PyExc_SyntaxError,
682 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000683 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000684 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000685 str = PyString_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000686 }
687 assert(tok->decoding_buffer == NULL);
688 tok->decoding_buffer = utf8; /* CAUTION */
689 return str;
690}
691
692#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000693
694/* Set up tokenizer for string */
695
696struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000697PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000698{
699 struct tok_state *tok = tok_new();
700 if (tok == NULL)
701 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000703 if (str == NULL) {
704 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000706 }
707
Martin v. Löwis95292d62002-12-11 14:04:59 +0000708 /* XXX: constify members. */
709 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000710 return tok;
711}
712
713
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000714/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715
716struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000717PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000718{
719 struct tok_state *tok = tok_new();
720 if (tok == NULL)
721 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000722 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000723 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724 return NULL;
725 }
726 tok->cur = tok->inp = tok->buf;
727 tok->end = tok->buf + BUFSIZ;
728 tok->fp = fp;
729 tok->prompt = ps1;
730 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000731 if (enc != NULL) {
732 /* Must copy encoding declaration since it
733 gets copied into the parse tree. */
734 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
735 if (!tok->encoding) {
736 PyTokenizer_Free(tok);
737 return NULL;
738 }
739 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000740 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000741 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 return tok;
743}
744
745
746/* Free a tok_state structure */
747
748void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000749PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000751 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000752 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000753#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000754 Py_XDECREF(tok->decoding_readline);
755 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000756#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000758 PyMem_FREE(tok->buf);
759 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760}
761
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762/* Get next char, updating state; error code goes into tok->done */
763
764static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000765tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000768 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000769 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000770 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000771 if (tok->done != E_OK)
772 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000774 char *end = strchr(tok->inp, '\n');
775 if (end != NULL)
776 end++;
777 else {
778 end = strchr(tok->inp, '\0');
779 if (end == tok->inp) {
780 tok->done = E_EOF;
781 return EOF;
782 }
783 }
784 if (tok->start == NULL)
785 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000786 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000787 tok->lineno++;
788 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000789 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000790 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000792 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000793#ifndef PGEN
794 if (tok->encoding && newtok && *newtok) {
795 /* Recode to UTF-8 */
796 Py_ssize_t buflen;
797 const char* buf;
798 PyObject *u = translate_into_utf8(newtok, tok->encoding);
799 PyMem_FREE(newtok);
800 if (!u) {
801 tok->done = E_DECODE;
802 return EOF;
803 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000804 buflen = PyString_GET_SIZE(u);
805 buf = PyString_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000806 if (!buf) {
807 Py_DECREF(u);
808 tok->done = E_DECODE;
809 return EOF;
810 }
811 newtok = PyMem_MALLOC(buflen+1);
812 strcpy(newtok, buf);
813 Py_DECREF(u);
814 }
815#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816 if (tok->nextprompt != NULL)
817 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000818 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000820 else if (*newtok == '\0') {
821 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822 tok->done = E_EOF;
823 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000825 size_t start = tok->start - tok->buf;
826 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000827 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000828 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000829 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 tok->lineno++;
831 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000832 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000833 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000834 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000835 tok->done = E_NOMEM;
836 return EOF;
837 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000838 tok->buf = buf;
839 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000840 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000841 strcpy(tok->buf + oldlen, newtok);
842 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000843 tok->inp = tok->buf + newlen;
844 tok->end = tok->inp + 1;
845 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000846 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000847 else {
848 tok->lineno++;
849 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000850 PyMem_FREE(tok->buf);
851 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000852 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000853 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000854 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000855 tok->inp = strchr(tok->buf, '\0');
856 tok->end = tok->inp + 1;
857 }
858 }
859 else {
860 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000861 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000862 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000863 if (tok->start == NULL) {
864 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000865 tok->buf = (char *)
866 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000867 if (tok->buf == NULL) {
868 tok->done = E_NOMEM;
869 return EOF;
870 }
871 tok->end = tok->buf + BUFSIZ;
872 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000873 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
874 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000875 tok->done = E_EOF;
876 done = 1;
877 }
878 else {
879 tok->done = E_OK;
880 tok->inp = strchr(tok->buf, '\0');
881 done = tok->inp[-1] == '\n';
882 }
883 }
884 else {
885 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000886 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000887 tok->done = E_EOF;
888 done = 1;
889 }
890 else
891 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 }
893 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000895 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000896 Py_ssize_t curstart = tok->start == NULL ? -1 :
897 tok->start - tok->buf;
898 Py_ssize_t curvalid = tok->inp - tok->buf;
899 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 newbuf = (char *)PyMem_REALLOC(newbuf,
902 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000903 if (newbuf == NULL) {
904 tok->done = E_NOMEM;
905 tok->cur = tok->inp;
906 return EOF;
907 }
908 tok->buf = newbuf;
909 tok->inp = tok->buf + curvalid;
910 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000911 tok->start = curstart < 0 ? NULL :
912 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000913 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000914 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000915 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000916 /* Break out early on decoding
917 errors, as tok->buf will be NULL
918 */
919 if (tok->decoding_erred)
920 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000921 /* Last line does not end in \n,
922 fake one */
923 strcpy(tok->inp, "\n");
924 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000925 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000926 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000927 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000928 if (tok->buf != NULL) {
929 tok->cur = tok->buf + cur;
930 tok->line_start = tok->cur;
931 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000932 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000933 pt = tok->inp - 2;
934 if (pt >= tok->buf && *pt == '\r') {
935 *pt++ = '\n';
936 *pt = '\0';
937 tok->inp = pt;
938 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000939 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000940 }
941 if (tok->done != E_OK) {
942 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000943 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000944 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000945 return EOF;
946 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000947 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000948 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949}
950
951
952/* Back-up one character */
953
954static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000955tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000956{
957 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000958 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000959 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000960 if (*tok->cur != c)
961 *tok->cur = c;
962 }
963}
964
965
966/* Return the token corresponding to a single character */
967
968int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000969PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000970{
971 switch (c) {
972 case '(': return LPAR;
973 case ')': return RPAR;
974 case '[': return LSQB;
975 case ']': return RSQB;
976 case ':': return COLON;
977 case ',': return COMMA;
978 case ';': return SEMI;
979 case '+': return PLUS;
980 case '-': return MINUS;
981 case '*': return STAR;
982 case '/': return SLASH;
983 case '|': return VBAR;
984 case '&': return AMPER;
985 case '<': return LESS;
986 case '>': return GREATER;
987 case '=': return EQUAL;
988 case '.': return DOT;
989 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000990 case '{': return LBRACE;
991 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000992 case '^': return CIRCUMFLEX;
993 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000994 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995 default: return OP;
996 }
997}
998
999
Guido van Rossumfbab9051991-10-20 20:25:03 +00001000int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001001PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001002{
1003 switch (c1) {
1004 case '=':
1005 switch (c2) {
1006 case '=': return EQEQUAL;
1007 }
1008 break;
1009 case '!':
1010 switch (c2) {
1011 case '=': return NOTEQUAL;
1012 }
1013 break;
1014 case '<':
1015 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +00001016 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001017 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001018 }
1019 break;
1020 case '>':
1021 switch (c2) {
1022 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001023 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001024 }
1025 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001026 case '+':
1027 switch (c2) {
1028 case '=': return PLUSEQUAL;
1029 }
1030 break;
1031 case '-':
1032 switch (c2) {
1033 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001034 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001035 }
1036 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001037 case '*':
1038 switch (c2) {
1039 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001040 case '=': return STAREQUAL;
1041 }
1042 break;
1043 case '/':
1044 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001045 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001046 case '=': return SLASHEQUAL;
1047 }
1048 break;
1049 case '|':
1050 switch (c2) {
1051 case '=': return VBAREQUAL;
1052 }
1053 break;
1054 case '%':
1055 switch (c2) {
1056 case '=': return PERCENTEQUAL;
1057 }
1058 break;
1059 case '&':
1060 switch (c2) {
1061 case '=': return AMPEREQUAL;
1062 }
1063 break;
1064 case '^':
1065 switch (c2) {
1066 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001067 }
1068 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001069 }
1070 return OP;
1071}
1072
Thomas Wouters434d0822000-08-24 20:11:32 +00001073int
1074PyToken_ThreeChars(int c1, int c2, int c3)
1075{
1076 switch (c1) {
1077 case '<':
1078 switch (c2) {
1079 case '<':
1080 switch (c3) {
1081 case '=':
1082 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 }
1084 break;
1085 }
1086 break;
1087 case '>':
1088 switch (c2) {
1089 case '>':
1090 switch (c3) {
1091 case '=':
1092 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001093 }
1094 break;
1095 }
1096 break;
1097 case '*':
1098 switch (c2) {
1099 case '*':
1100 switch (c3) {
1101 case '=':
1102 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001103 }
1104 break;
1105 }
1106 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001107 case '/':
1108 switch (c2) {
1109 case '/':
1110 switch (c3) {
1111 case '=':
1112 return DOUBLESLASHEQUAL;
1113 }
1114 break;
1115 }
1116 break;
Georg Brandldde00282007-03-18 19:01:53 +00001117 case '.':
1118 switch (c2) {
1119 case '.':
1120 switch (c3) {
1121 case '.':
1122 return ELLIPSIS;
1123 }
1124 break;
1125 }
1126 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001127 }
1128 return OP;
1129}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001130
Guido van Rossum926f13a1998-04-09 21:38:06 +00001131static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001132indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133{
1134 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001135 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001136 tok->cur = tok->inp;
1137 return 1;
1138 }
1139 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001140 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1141 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001142 tok->altwarning = 0;
1143 }
1144 return 0;
1145}
1146
Martin v. Löwis47383402007-08-15 07:32:56 +00001147#ifdef PGEN
1148#define verify_identifier(s,e) 1
1149#else
1150/* Verify that the identifier follows PEP 3131. */
1151static int
1152verify_identifier(char *start, char *end)
1153{
Guido van Rossume3e37012007-08-29 18:54:41 +00001154 PyObject *s;
1155 int result;
1156 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1157 if (s == NULL) {
1158 PyErr_Clear();
1159 return 0;
1160 }
1161 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001162 Py_DECREF(s);
1163 return result;
1164}
1165#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001166
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001167/* Get next token, after space stripping etc. */
1168
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001169static int
1170tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001171{
1172 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001173 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001174
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001175 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001176 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001177 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001178 blankline = 0;
1179
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 /* Get indentation level */
1181 if (tok->atbol) {
1182 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001183 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 for (;;) {
1186 c = tok_nextc(tok);
1187 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001188 col++, altcol++;
1189 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001190 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001191 altcol = (altcol/tok->alttabsize + 1)
1192 * tok->alttabsize;
1193 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001194 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001195 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001196 else
1197 break;
1198 }
1199 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200 if (c == '#' || c == '\n') {
1201 /* Lines with only whitespace and/or comments
1202 shouldn't affect the indentation and are
1203 not passed to the parser as NEWLINE tokens,
1204 except *totally* empty lines in interactive
1205 mode, which signal the end of a command group. */
1206 if (col == 0 && c == '\n' && tok->prompt != NULL)
1207 blankline = 0; /* Let it through */
1208 else
1209 blankline = 1; /* Ignore completely */
1210 /* We can't jump back right here since we still
1211 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001212 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001213 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001214 if (col == tok->indstack[tok->indent]) {
1215 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001216 if (altcol != tok->altindstack[tok->indent]) {
1217 if (indenterror(tok))
1218 return ERRORTOKEN;
1219 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001221 else if (col > tok->indstack[tok->indent]) {
1222 /* Indent -- always one */
1223 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001224 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001225 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001226 return ERRORTOKEN;
1227 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228 if (altcol <= tok->altindstack[tok->indent]) {
1229 if (indenterror(tok))
1230 return ERRORTOKEN;
1231 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001232 tok->pendin++;
1233 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001234 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001236 else /* col < tok->indstack[tok->indent] */ {
1237 /* Dedent -- any number, must be consistent */
1238 while (tok->indent > 0 &&
1239 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001240 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001241 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001242 }
1243 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001244 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001245 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001246 return ERRORTOKEN;
1247 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001248 if (altcol != tok->altindstack[tok->indent]) {
1249 if (indenterror(tok))
1250 return ERRORTOKEN;
1251 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001252 }
1253 }
1254 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001255
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001256 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001257
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 /* Return pending indents/dedents */
1259 if (tok->pendin != 0) {
1260 if (tok->pendin < 0) {
1261 tok->pendin++;
1262 return DEDENT;
1263 }
1264 else {
1265 tok->pendin--;
1266 return INDENT;
1267 }
1268 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001269
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001270 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001271 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 /* Skip spaces */
1273 do {
1274 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001275 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001276
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001278 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001279
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001280 /* Skip comment */
1281 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001282 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001284
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001286 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001288 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001291 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001292 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001293 /* Process b"", r"" and br"" */
1294 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001295 c = tok_nextc(tok);
1296 if (c == '"' || c == '\'')
1297 goto letter_quote;
1298 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001299 if (c == 'r' || c == 'R') {
1300 c = tok_nextc(tok);
1301 if (c == '"' || c == '\'')
1302 goto letter_quote;
1303 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001304 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001305 if (c >= 128)
1306 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001307 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001308 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001310 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001311 !verify_identifier(tok->start, tok->cur)) {
1312 tok->done = E_IDENTIFIER;
1313 return ERRORTOKEN;
1314 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001315 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 *p_end = tok->cur;
1317 return NAME;
1318 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001319
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 /* Newline */
1321 if (c == '\n') {
1322 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001323 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001324 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001325 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001327 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 return NEWLINE;
1329 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001330
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001331 /* Period or number starting with period? */
1332 if (c == '.') {
1333 c = tok_nextc(tok);
1334 if (isdigit(c)) {
1335 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001336 } else if (c == '.') {
1337 c = tok_nextc(tok);
1338 if (c == '.') {
1339 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001340 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001341 return ELLIPSIS;
1342 } else {
1343 tok_backup(tok, c);
1344 }
1345 tok_backup(tok, '.');
1346 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001347 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001348 }
Georg Brandldde00282007-03-18 19:01:53 +00001349 *p_start = tok->start;
1350 *p_end = tok->cur;
1351 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001352 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001353
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 /* Number */
1355 if (isdigit(c)) {
1356 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001357 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 c = tok_nextc(tok);
1359 if (c == '.')
1360 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001361#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001362 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001363 goto imaginary;
1364#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001366
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001368 c = tok_nextc(tok);
1369 if (!isxdigit(c)) {
1370 tok->done = E_TOKEN;
1371 tok_backup(tok, c);
1372 return ERRORTOKEN;
1373 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 do {
1375 c = tok_nextc(tok);
1376 } while (isxdigit(c));
1377 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001378 else if (c == 'o' || c == 'O') {
1379 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001380 c = tok_nextc(tok);
1381 if (c < '0' || c > '8') {
1382 tok->done = E_TOKEN;
1383 tok_backup(tok, c);
1384 return ERRORTOKEN;
1385 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001386 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001388 } while ('0' <= c && c < '8');
1389 }
1390 else if (c == 'b' || c == 'B') {
1391 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001392 c = tok_nextc(tok);
1393 if (c != '0' && c != '1') {
1394 tok->done = E_TOKEN;
1395 tok_backup(tok, c);
1396 return ERRORTOKEN;
1397 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001398 do {
1399 c = tok_nextc(tok);
1400 } while (c == '0' || c == '1');
1401 }
1402 else {
1403 int nonzero = 0;
1404 /* maybe old-style octal; c is first char of it */
1405 /* in any case, allow '0' as a literal */
1406 while (c == '0')
1407 c = tok_nextc(tok);
1408 while (isdigit(c)) {
1409 nonzero = 1;
1410 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001411 }
1412 if (c == '.')
1413 goto fraction;
1414 else if (c == 'e' || c == 'E')
1415 goto exponent;
1416#ifndef WITHOUT_COMPLEX
1417 else if (c == 'j' || c == 'J')
1418 goto imaginary;
1419#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001420 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001421 tok->done = E_TOKEN;
1422 tok_backup(tok, c);
1423 return ERRORTOKEN;
1424 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 }
1426 }
1427 else {
1428 /* Decimal */
1429 do {
1430 c = tok_nextc(tok);
1431 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001432 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001433 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001434 if (c == '.') {
1435 fraction:
1436 /* Fraction */
1437 do {
1438 c = tok_nextc(tok);
1439 } while (isdigit(c));
1440 }
1441 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001442 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001443 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001445 if (c == '+' || c == '-')
1446 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001447 if (!isdigit(c)) {
1448 tok->done = E_TOKEN;
1449 tok_backup(tok, c);
1450 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001451 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001452 do {
1453 c = tok_nextc(tok);
1454 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001456#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001457 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001458 /* Imaginary part */
1459 imaginary:
1460 c = tok_nextc(tok);
1461#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462 }
1463 }
1464 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001465 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 *p_end = tok->cur;
1467 return NUMBER;
1468 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001469
1470 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001471 /* String */
1472 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001473 int quote = c;
1474 int quote_size = 1; /* 1 or 3 */
1475 int end_quote_size = 0;
1476
1477 /* Find the quote size and start of string */
1478 c = tok_nextc(tok);
1479 if (c == quote) {
1480 c = tok_nextc(tok);
1481 if (c == quote)
1482 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001483 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001484 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001485 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001486 if (c != quote)
1487 tok_backup(tok, c);
1488
1489 /* Get rest of string */
1490 while (end_quote_size != quote_size) {
1491 c = tok_nextc(tok);
1492 if (c == EOF) {
1493 if (quote_size == 3)
1494 tok->done = E_EOFS;
1495 else
1496 tok->done = E_EOLS;
1497 tok->cur = tok->inp;
1498 return ERRORTOKEN;
1499 }
1500 if (quote_size == 1 && c == '\n') {
1501 tok->done = E_EOLS;
1502 tok->cur = tok->inp;
1503 return ERRORTOKEN;
1504 }
1505 if (c == quote)
1506 end_quote_size += 1;
1507 else {
1508 end_quote_size = 0;
1509 if (c == '\\')
1510 c = tok_nextc(tok); /* skip escaped char */
1511 }
1512 }
1513
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001514 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001515 *p_end = tok->cur;
1516 return STRING;
1517 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001518
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001519 /* Line continuation */
1520 if (c == '\\') {
1521 c = tok_nextc(tok);
1522 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001523 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001524 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001525 return ERRORTOKEN;
1526 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001527 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001528 goto again; /* Read next line */
1529 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001530
Guido van Rossumfbab9051991-10-20 20:25:03 +00001531 /* Check for two-character token */
1532 {
1533 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001534 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001535 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001536 int c3 = tok_nextc(tok);
1537 int token3 = PyToken_ThreeChars(c, c2, c3);
1538 if (token3 != OP) {
1539 token = token3;
1540 } else {
1541 tok_backup(tok, c3);
1542 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001543 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001544 *p_end = tok->cur;
1545 return token;
1546 }
1547 tok_backup(tok, c2);
1548 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001549
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001550 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001551 switch (c) {
1552 case '(':
1553 case '[':
1554 case '{':
1555 tok->level++;
1556 break;
1557 case ')':
1558 case ']':
1559 case '}':
1560 tok->level--;
1561 break;
1562 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001563
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001564 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001565 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001566 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001567 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001568}
1569
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001570int
1571PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1572{
1573 int result = tok_get(tok, p_start, p_end);
1574 if (tok->decoding_erred) {
1575 result = ERRORTOKEN;
1576 tok->done = E_DECODE;
1577 }
1578 return result;
1579}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001580
Thomas Wouters89d996e2007-09-08 17:39:28 +00001581/* This function is only called from parsetok. However, it cannot live
1582 there, as it must be empty for PGEN, and we can check for PGEN only
1583 in this file. */
1584
1585#ifdef PGEN
/* PGEN build: nothing to restore; the arguments are ignored and NULL
   is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
	(void)tok;
	(void)len;
	(void)offset;
	return NULL;
}
1591#else
1592static PyObject *
1593dec_utf8(const char *enc, const char *text, size_t len) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001594 PyObject *ret = NULL;
Thomas Wouters89d996e2007-09-08 17:39:28 +00001595 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1596 if (unicode_text) {
1597 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1598 Py_DECREF(unicode_text);
1599 }
1600 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001601 PyErr_Clear();
1602 }
1603 else {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001604 assert(PyString_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001605 }
1606 return ret;
1607}
1608
1609char *
1610PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1611{
1612 char *text = NULL;
1613 if (tok->encoding) {
1614 /* convert source to original encondig */
1615 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1616 if (lineobj != NULL) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001617 int linelen = PyString_GET_SIZE(lineobj);
1618 const char *line = PyString_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001619 text = PyObject_MALLOC(linelen + 1);
1620 if (text != NULL && line != NULL) {
1621 if (linelen)
1622 strncpy(text, line, linelen);
1623 text[linelen] = '\0';
1624 }
1625 Py_DECREF(lineobj);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001626
Thomas Wouters89d996e2007-09-08 17:39:28 +00001627 /* adjust error offset */
1628 if (*offset > 1) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00001629 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001630 tok->buf,
1631 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001632 if (offsetobj) {
Christian Heimes90aa7642007-12-19 02:45:37 +00001633 *offset = 1 + Py_SIZE(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001634 Py_DECREF(offsetobj);
1635 }
1636 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001637
Thomas Wouters89d996e2007-09-08 17:39:28 +00001638 }
1639 }
1640 return text;
1641
1642}
1643#endif
1644
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001645/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001646
1647 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001648 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001649 should be assumed to be PyUnicode_GetDefaultEncoding()).
1650
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001651 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1652 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001653*/
1654char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001655PyTokenizer_FindEncoding(int fd)
1656{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001657 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001658 FILE *fp;
1659 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001660
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001661 fd = dup(fd);
1662 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001663 return NULL;
1664 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001665 fp = fdopen(fd, "r");
1666 if (fp == NULL) {
1667 return NULL;
1668 }
1669 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1670 if (tok == NULL) {
1671 fclose(fp);
1672 return NULL;
1673 }
1674 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001675 PyTokenizer_Get(tok, &p_start, &p_end);
1676 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001677 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001678 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001679 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001680 strcpy(encoding, tok->encoding);
1681 }
1682 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001683 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001684}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001685
Guido van Rossum408027e1996-12-30 16:17:54 +00001686#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001687
1688void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001689tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001690{
Guido van Rossum86bea461997-04-29 21:03:06 +00001691 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001692 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1693 printf("(%.*s)", (int)(end - start), start);
1694}
1695
1696#endif