blob: d60b25694aad0f7dd9af4ade46b688bad9d216b2 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable names of the token types.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* Basic tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparison, bitwise and power operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* Pseudo-tokens; keep these last */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000108 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 tok->tabsize = TABSIZE;
110 tok->indent = 0;
111 tok->indstack[0] = 0;
112 tok->atbol = 1;
113 tok->pendin = 0;
114 tok->prompt = tok->nextprompt = NULL;
115 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000116 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000117 tok->filename = NULL;
118 tok->altwarning = 0;
119 tok->alterror = 0;
120 tok->alttabsize = 1;
121 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000122 tok->decoding_state = 0;
123 tok->decoding_erred = 0;
124 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000125 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000126 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000127#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000128 tok->decoding_readline = NULL;
129 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000130#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131 return tok;
132}
133
Benjamin Petersone36199b2009-11-12 23:39:44 +0000134static char *
135new_string(const char *s, Py_ssize_t len)
136{
137 char* result = (char *)PyMem_MALLOC(len + 1);
138 if (result != NULL) {
139 memcpy(result, s, len);
140 result[len] = '\0';
141 }
142 return result;
143}
144
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145#ifdef PGEN
146
147static char *
148decoding_fgets(char *s, int size, struct tok_state *tok)
149{
150 return fgets(s, size, tok->fp);
151}
152
153static int
154decoding_feof(struct tok_state *tok)
155{
156 return feof(tok->fp);
157}
158
Benjamin Petersone36199b2009-11-12 23:39:44 +0000159static char *
160decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000161{
Benjamin Petersone36199b2009-11-12 23:39:44 +0000162 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000163}
164
165#else /* PGEN */
166
167static char *
168error_ret(struct tok_state *tok) /* XXX */
169{
170 tok->decoding_erred = 1;
171 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000172 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173 tok->buf = NULL;
174 return NULL; /* as if it were EOF */
175}
176
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177
/* Normalize the common spellings of the UTF-8 and Latin-1 encoding names.

   The first 12 characters of S are lower-cased and '_' is replaced by '-'.
   If the result is "utf-8" (or starts with "utf-8-") the static string
   "utf-8" is returned; if it is one of the Latin-1 aliases (optionally
   followed by "-...") the static string "iso-8859-1" is returned.
   Otherwise S itself is returned unchanged, so callers can detect a
   normalization by comparing the result pointer with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
Victor Stinnerd23d3932010-03-02 23:20:02 +0000315 int ch1, ch2, ch3;
316 ch1 = get_char(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000317 tok->decoding_state = 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000318 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000319 return 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000320 } else if (ch1 == 0xEF) {
321 ch2 = get_char(tok);
322 if (ch2 != 0xBB) {
323 unget_char(ch2, tok);
324 unget_char(ch1, tok);
325 return 1;
326 }
327 ch3 = get_char(tok);
328 if (ch3 != 0xBF) {
329 unget_char(ch3, tok);
330 unget_char(ch2, tok);
331 unget_char(ch1, tok);
332 return 1;
333 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000334#if 0
335 /* Disable support for UTF-16 BOMs until a decision
336 is made whether this needs to be supported. */
Victor Stinnerd23d3932010-03-02 23:20:02 +0000337 } else if (ch1 == 0xFE) {
338 ch2 = get_char(tok);
339 if (ch2 != 0xFF) {
340 unget_char(ch2, tok);
341 unget_char(ch1, tok);
342 return 1;
343 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000344 if (!set_readline(tok, "utf-16-be"))
345 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000346 tok->decoding_state = -1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000347 } else if (ch1 == 0xFF) {
348 ch2 = get_char(tok);
349 if (ch2 != 0xFE) {
350 unget_char(ch2, tok);
351 unget_char(ch1, tok);
352 return 1;
353 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000354 if (!set_readline(tok, "utf-16-le"))
355 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356 tok->decoding_state = -1;
357#endif
358 } else {
Victor Stinnerd23d3932010-03-02 23:20:02 +0000359 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000360 return 1;
361 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000362 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000363 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000364 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
365 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366}
367
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   Return NULL on failure or at end of input, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: tok->decoding_readline has to be called to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
           and stored the result in tok->decoding_buffer
     3) PyStringObject *: a previous call to fp_readl did not have enough
           room (in the s buffer) to copy the entire line read by
           tok->decoding_readline; tok->decoding_buffer holds the overflow.
           In this case, fp_readl is called in a loop (with an expanded
           buffer) until the buffer ends with a '\n' (or until the end of
           the file is reached): see tok_nextc and its calls to
           decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *bytes = NULL;
    char *data;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        tok->decoding_buffer = NULL;
        /* Case 3 above: the overflow is already UTF-8 bytes */
        if (PyString_CheckExact(pending))
            bytes = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (bytes == NULL) {
        bytes = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (bytes == NULL)
            return error_ret(tok);
    }

    data = PyString_AsString(bytes);
    nbytes = PyString_GET_SIZE(bytes);
    if (nbytes > size) {
        /* Does not fit: keep the tail for the next call */
        tok->decoding_buffer = PyString_FromStringAndSize(data + size,
                                                          nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(bytes);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, data, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(bytes);

    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
433
434/* Set the readline function for TOK to a StreamReader's
435 readline function. The StreamReader is named ENC.
436
437 This function is called from check_bom and check_coding_spec.
438
439 ENC is usually identical to the future value of tok->encoding,
440 except for the (currently unsupported) case of UTF-16.
441
442 Return 1 on success, 0 on failure. */
443
444static int
445fp_setreadl(struct tok_state *tok, const char* enc)
446{
447 PyObject *reader, *stream, *readline;
448
Martin v. Löwis95292d62002-12-11 14:04:59 +0000449 /* XXX: constify filename argument. */
450 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000451 if (stream == NULL)
452 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453
454 reader = PyCodec_StreamReader(enc, stream, NULL);
455 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000456 if (reader == NULL)
457 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458
459 readline = PyObject_GetAttrString(reader, "readline");
460 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000461 if (readline == NULL)
462 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
464 tok->decoding_readline = readline;
465 return 1;
466}
467
468/* Fetch the next byte from TOK. */
469
470static int fp_getc(struct tok_state *tok) {
471 return getc(tok->fp);
472}
473
474/* Unfetch the last byte back into TOK. */
475
476static void fp_ungetc(int c, struct tok_state *tok) {
477 ungetc(c, tok->fp);
478}
479
480/* Read a line of input from TOK. Determine encoding
481 if necessary. */
482
483static char *
484decoding_fgets(char *s, int size, struct tok_state *tok)
485{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000486 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000487 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000488 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000489 if (tok->decoding_state < 0) {
490 /* We already have a codec associated with
491 this input. */
492 line = fp_readl(s, size, tok);
493 break;
494 } else if (tok->decoding_state > 0) {
495 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000496 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000497 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 break;
499 } else {
500 /* We have not yet determined the encoding.
501 If an encoding is found, use the file-pointer
502 reader functions from now on. */
503 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
504 return error_ret(tok);
505 assert(tok->decoding_state != 0);
506 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000507 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000508 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
509 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
510 return error_ret(tok);
511 }
512 }
513#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000514 /* The default encoding is ASCII, so make sure we don't have any
515 non-ASCII bytes in it. */
516 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000518 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000519 if (*c > 127) {
520 badchar = *c;
521 break;
522 }
523 }
524 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000525 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000526 /* Need to add 1 to the line number, since this line
527 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000528 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000529 "Non-ASCII character '\\x%.2x' "
530 "in file %.200s on line %i, "
531 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000532 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000533 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000534 PyErr_SetString(PyExc_SyntaxError, buf);
535 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000536 }
537#endif
538 return line;
539}
540
541static int
542decoding_feof(struct tok_state *tok)
543{
544 if (tok->decoding_state >= 0) {
545 return feof(tok->fp);
546 } else {
547 PyObject* buf = tok->decoding_buffer;
548 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000549 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 if (buf == NULL) {
551 error_ret(tok);
552 return 1;
553 } else {
554 tok->decoding_buffer = buf;
555 }
556 }
557 return PyObject_Length(buf) == 0;
558 }
559}
560
561/* Fetch a byte from TOK, using the string buffer. */
562
Tim Petersc9d78aa2006-03-26 23:27:58 +0000563static int
564buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000565 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000566}
567
568/* Unfetch a byte from TOK, using the string buffer. */
569
Tim Petersc9d78aa2006-03-26 23:27:58 +0000570static void
571buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000573 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574}
575
576/* Set the readline function for TOK to ENC. For the string-based
577 tokenizer, this means to just record the encoding. */
578
Tim Petersc9d78aa2006-03-26 23:27:58 +0000579static int
580buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000581 tok->enc = enc;
582 return 1;
583}
584
/* Return a UTF-8 encoded Python string object built from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *unicode = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (unicode != NULL) {
        encoded = PyUnicode_AsUTF8String(unicode);
        Py_DECREF(unicode);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000600
Benjamin Petersone36199b2009-11-12 23:39:44 +0000601
602static char *
603translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000604 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000605 char *buf, *current;
Benjamin Peterson42d63842009-12-06 17:37:48 +0000606 char c = '\0';
607 buf = PyMem_MALLOC(needed_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000608 if (buf == NULL) {
609 tok->done = E_NOMEM;
610 return NULL;
611 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000612 for (current = buf; *s; s++, current++) {
613 c = *s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000614 if (skip_next_lf) {
615 skip_next_lf = 0;
616 if (c == '\n') {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000617 c = *++s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000618 if (!c)
619 break;
620 }
621 }
622 if (c == '\r') {
623 skip_next_lf = 1;
624 c = '\n';
625 }
626 *current = c;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000627 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000628 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersone36199b2009-11-12 23:39:44 +0000629 there isn't one already. */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000630 if (exec_input && c != '\n') {
Benjamin Petersone36199b2009-11-12 23:39:44 +0000631 *current = '\n';
632 current++;
633 }
634 *current = '\0';
Benjamin Peterson42d63842009-12-06 17:37:48 +0000635 final_length = current - buf + 1;
636 if (final_length < needed_length && final_length)
Benjamin Petersone36199b2009-11-12 23:39:44 +0000637 /* should never fail */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000638 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000639 return buf;
640}
641
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642/* Decode a byte string STR for use as the buffer of TOK.
643 Look for encoding declarations inside STR, and record them
644 inside TOK. */
645
646static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000647decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648{
649 PyObject* utf8 = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000650 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000652 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653 int lineno = 0;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000654 tok->input = str = translate_newlines(input, single, tok);
655 if (str == NULL)
656 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000657 tok->enc = NULL;
658 tok->str = str;
659 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000660 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000662 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000663#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664 if (tok->enc != NULL) {
665 utf8 = translate_into_utf8(str, tok->enc);
666 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000667 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000668 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000670#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000671 for (s = str;; s++) {
672 if (*s == '\0') break;
673 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000674 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000675 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000676 lineno++;
677 if (lineno == 2) break;
678 }
679 }
680 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000681 /* need to check line 1 and 2 separately since check_coding_spec
682 assumes a single line as input */
683 if (newl[0]) {
684 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
685 return error_ret(tok);
686 if (tok->enc == NULL && newl[1]) {
687 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
688 tok, buf_setreadl))
689 return error_ret(tok);
690 }
691 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000692#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000693 if (tok->enc != NULL) {
694 assert(utf8 == NULL);
695 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000696 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000697 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000698 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000700#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000701 assert(tok->decoding_buffer == NULL);
702 tok->decoding_buffer = utf8; /* CAUTION */
703 return str;
704}
705
706#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000707
708/* Set up tokenizer for string */
709
710struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000711PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712{
713 struct tok_state *tok = tok_new();
714 if (tok == NULL)
715 return NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000716 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000717 if (str == NULL) {
718 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000720 }
721
Martin v. Löwis95292d62002-12-11 14:04:59 +0000722 /* XXX: constify members. */
723 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724 return tok;
725}
726
727
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000728/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729
730struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000731PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732{
733 struct tok_state *tok = tok_new();
734 if (tok == NULL)
735 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000736 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000737 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000738 return NULL;
739 }
740 tok->cur = tok->inp = tok->buf;
741 tok->end = tok->buf + BUFSIZ;
742 tok->fp = fp;
743 tok->prompt = ps1;
744 tok->nextprompt = ps2;
745 return tok;
746}
747
748
749/* Free a tok_state structure */
750
751void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000752PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000754 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000755 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000756#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000757 Py_XDECREF(tok->decoding_readline);
758 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000759#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000761 PyMem_FREE(tok->buf);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000762 if (tok->input)
763 PyMem_FREE((char *)tok->input);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000764 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765}
766
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the line *INP, read from stdin, into UTF-8 using the encoding
   recorded on sys.stdin.

   Nothing is done (and 0 is returned) unless sys.stdin is the C stdin, is
   a file object and has a string f_encoding.  On success *INP is replaced
   by a new PyMem_MALLOC'ed UTF-8 string and tok->encoding by a copy of the
   encoding name.  A UnicodeDecodeError is cleared and the line is left
   untouched.

   Returns 0 in all those cases; returns -1 with tok->done set to E_NOMEM
   or E_ERROR otherwise. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *stdin_obj, *enc_obj, *unicode, *encoded;
    const char *enc_name;
    char *utf8_copy;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    /* Decode with the stdin encoding, then encode again as UTF-8 */
    encoded = NULL;
    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode != NULL) {
        encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
        Py_DECREF(unicode);
    }
    if (encoded == NULL) {
        Py_DECREF(enc_obj);
        if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_ERROR;
            return -1;
        }
        /* Fallback to iso-8859-1: for backward compatibility */
        PyErr_Clear();
        return 0;
    }

    assert(PyString_Check(encoded));
    utf8_copy = new_string(PyString_AS_STRING(encoded),
                           PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);

    if (utf8_copy != NULL) {
        PyMem_FREE(*inp);
        *inp = utf8_copy;
        if (tok->encoding != NULL)
            PyMem_FREE(tok->encoding);
        tok->encoding = new_string(enc_name, strlen(enc_name));
        if (tok->encoding != NULL) {
            Py_DECREF(enc_obj);
            return 0;
        }
    }

    /* One of the two copies could not be allocated */
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000830
831/* Get next char, updating state; error code goes into tok->done */
832
833static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000834tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000837 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000838 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000839 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000840 if (tok->done != E_OK)
841 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000843 char *end = strchr(tok->inp, '\n');
844 if (end != NULL)
845 end++;
846 else {
847 end = strchr(tok->inp, '\0');
848 if (end == tok->inp) {
849 tok->done = E_EOF;
850 return EOF;
851 }
852 }
853 if (tok->start == NULL)
854 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000855 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000856 tok->lineno++;
857 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000858 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000861 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862 if (tok->nextprompt != NULL)
863 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000864 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000866 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000867 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868 tok->done = E_EOF;
869 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000870#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000871 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000872 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000873#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000875 size_t start = tok->start - tok->buf;
876 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000877 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000878 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000879 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->lineno++;
881 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000882 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000883 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000884 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000885 tok->done = E_NOMEM;
886 return EOF;
887 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 tok->buf = buf;
889 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000890 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000891 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000892 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000893 tok->inp = tok->buf + newlen;
894 tok->end = tok->inp + 1;
895 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000897 else {
898 tok->lineno++;
899 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000900 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000901 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000902 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000903 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000904 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000905 tok->inp = strchr(tok->buf, '\0');
906 tok->end = tok->inp + 1;
907 }
908 }
909 else {
910 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000911 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000912 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000913 if (tok->start == NULL) {
914 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000915 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000916 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000917 if (tok->buf == NULL) {
918 tok->done = E_NOMEM;
919 return EOF;
920 }
921 tok->end = tok->buf + BUFSIZ;
922 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000923 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
924 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000925 tok->done = E_EOF;
926 done = 1;
927 }
928 else {
929 tok->done = E_OK;
930 tok->inp = strchr(tok->buf, '\0');
931 done = tok->inp[-1] == '\n';
932 }
933 }
934 else {
935 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000936 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000937 tok->done = E_EOF;
938 done = 1;
939 }
940 else
941 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000942 }
943 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000944 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000945 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000946 Py_ssize_t curstart = tok->start == NULL ? -1 :
947 tok->start - tok->buf;
948 Py_ssize_t curvalid = tok->inp - tok->buf;
949 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000950 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000951 newbuf = (char *)PyMem_REALLOC(newbuf,
952 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000953 if (newbuf == NULL) {
954 tok->done = E_NOMEM;
955 tok->cur = tok->inp;
956 return EOF;
957 }
958 tok->buf = newbuf;
959 tok->inp = tok->buf + curvalid;
960 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000961 tok->start = curstart < 0 ? NULL :
962 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000963 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000964 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000965 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000966 /* Break out early on decoding
967 errors, as tok->buf will be NULL
968 */
969 if (tok->decoding_erred)
970 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000971 /* Last line does not end in \n,
972 fake one */
973 strcpy(tok->inp, "\n");
974 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000975 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000976 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000977 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000978 if (tok->buf != NULL) {
979 tok->cur = tok->buf + cur;
980 tok->line_start = tok->cur;
981 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000982 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000983 pt = tok->inp - 2;
984 if (pt >= tok->buf && *pt == '\r') {
985 *pt++ = '\n';
986 *pt = '\0';
987 tok->inp = pt;
988 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000989 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000990 }
991 if (tok->done != E_OK) {
992 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000993 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000994 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000995 return EOF;
996 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000997 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000998 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000999}
1000
1001
1002/* Back-up one character */
1003
1004static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001005tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006{
1007 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001008 if (--tok->cur < tok->buf)
Benjamin Petersone3383b82009-11-07 01:04:38 +00001009 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001010 if (*tok->cur != c)
1011 *tok->cur = c;
1012 }
1013}
1014
1015
1016/* Return the token corresponding to a single character */
1017
1018int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001019PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001020{
1021 switch (c) {
1022 case '(': return LPAR;
1023 case ')': return RPAR;
1024 case '[': return LSQB;
1025 case ']': return RSQB;
1026 case ':': return COLON;
1027 case ',': return COMMA;
1028 case ';': return SEMI;
1029 case '+': return PLUS;
1030 case '-': return MINUS;
1031 case '*': return STAR;
1032 case '/': return SLASH;
1033 case '|': return VBAR;
1034 case '&': return AMPER;
1035 case '<': return LESS;
1036 case '>': return GREATER;
1037 case '=': return EQUAL;
1038 case '.': return DOT;
1039 case '%': return PERCENT;
1040 case '`': return BACKQUOTE;
1041 case '{': return LBRACE;
1042 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001043 case '^': return CIRCUMFLEX;
1044 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001045 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001046 default: return OP;
1047 }
1048}
1049
1050
Guido van Rossumfbab9051991-10-20 20:25:03 +00001051int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001052PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001053{
1054 switch (c1) {
1055 case '=':
1056 switch (c2) {
1057 case '=': return EQEQUAL;
1058 }
1059 break;
1060 case '!':
1061 switch (c2) {
1062 case '=': return NOTEQUAL;
1063 }
1064 break;
1065 case '<':
1066 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001067 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001068 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001069 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001070 }
1071 break;
1072 case '>':
1073 switch (c2) {
1074 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001075 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001076 }
1077 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001078 case '+':
1079 switch (c2) {
1080 case '=': return PLUSEQUAL;
1081 }
1082 break;
1083 case '-':
1084 switch (c2) {
1085 case '=': return MINEQUAL;
1086 }
1087 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001088 case '*':
1089 switch (c2) {
1090 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001091 case '=': return STAREQUAL;
1092 }
1093 break;
1094 case '/':
1095 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001096 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001097 case '=': return SLASHEQUAL;
1098 }
1099 break;
1100 case '|':
1101 switch (c2) {
1102 case '=': return VBAREQUAL;
1103 }
1104 break;
1105 case '%':
1106 switch (c2) {
1107 case '=': return PERCENTEQUAL;
1108 }
1109 break;
1110 case '&':
1111 switch (c2) {
1112 case '=': return AMPEREQUAL;
1113 }
1114 break;
1115 case '^':
1116 switch (c2) {
1117 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001118 }
1119 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001120 }
1121 return OP;
1122}
1123
Thomas Wouters434d0822000-08-24 20:11:32 +00001124int
1125PyToken_ThreeChars(int c1, int c2, int c3)
1126{
1127 switch (c1) {
1128 case '<':
1129 switch (c2) {
1130 case '<':
1131 switch (c3) {
1132 case '=':
1133 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001134 }
1135 break;
1136 }
1137 break;
1138 case '>':
1139 switch (c2) {
1140 case '>':
1141 switch (c3) {
1142 case '=':
1143 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001144 }
1145 break;
1146 }
1147 break;
1148 case '*':
1149 switch (c2) {
1150 case '*':
1151 switch (c3) {
1152 case '=':
1153 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001154 }
1155 break;
1156 }
1157 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001158 case '/':
1159 switch (c2) {
1160 case '/':
1161 switch (c3) {
1162 case '=':
1163 return DOUBLESLASHEQUAL;
1164 }
1165 break;
1166 }
1167 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001168 }
1169 return OP;
1170}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001171
Guido van Rossum926f13a1998-04-09 21:38:06 +00001172static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001173indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001174{
1175 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001176 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001177 tok->cur = tok->inp;
1178 return 1;
1179 }
1180 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001181 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1182 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001183 tok->altwarning = 0;
1184 }
1185 return 0;
1186}
1187
1188
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189/* Get next token, after space stripping etc. */
1190
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001191static int
1192tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193{
1194 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001195 int blankline;
1196
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001197 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001198 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001199 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200 blankline = 0;
1201
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 /* Get indentation level */
1203 if (tok->atbol) {
1204 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001205 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001206 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 for (;;) {
1208 c = tok_nextc(tok);
1209 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001210 col++, altcol++;
1211 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001212 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001213 altcol = (altcol/tok->alttabsize + 1)
1214 * tok->alttabsize;
1215 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001216 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001217 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001218 else
1219 break;
1220 }
1221 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001222 if (c == '#' || c == '\n') {
1223 /* Lines with only whitespace and/or comments
1224 shouldn't affect the indentation and are
1225 not passed to the parser as NEWLINE tokens,
1226 except *totally* empty lines in interactive
1227 mode, which signal the end of a command group. */
1228 if (col == 0 && c == '\n' && tok->prompt != NULL)
1229 blankline = 0; /* Let it through */
1230 else
1231 blankline = 1; /* Ignore completely */
1232 /* We can't jump back right here since we still
1233 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001235 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001236 if (col == tok->indstack[tok->indent]) {
1237 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001238 if (altcol != tok->altindstack[tok->indent]) {
1239 if (indenterror(tok))
1240 return ERRORTOKEN;
1241 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001242 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001243 else if (col > tok->indstack[tok->indent]) {
1244 /* Indent -- always one */
1245 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001246 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001247 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001248 return ERRORTOKEN;
1249 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001250 if (altcol <= tok->altindstack[tok->indent]) {
1251 if (indenterror(tok))
1252 return ERRORTOKEN;
1253 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001254 tok->pendin++;
1255 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001256 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001258 else /* col < tok->indstack[tok->indent] */ {
1259 /* Dedent -- any number, must be consistent */
1260 while (tok->indent > 0 &&
1261 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001262 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001263 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001264 }
1265 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001266 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001267 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001268 return ERRORTOKEN;
1269 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270 if (altcol != tok->altindstack[tok->indent]) {
1271 if (indenterror(tok))
1272 return ERRORTOKEN;
1273 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001274 }
1275 }
1276 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001277
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001278 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001279
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 /* Return pending indents/dedents */
1281 if (tok->pendin != 0) {
1282 if (tok->pendin < 0) {
1283 tok->pendin++;
1284 return DEDENT;
1285 }
1286 else {
1287 tok->pendin--;
1288 return INDENT;
1289 }
1290 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001291
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001293 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 /* Skip spaces */
1295 do {
1296 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001297 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001298
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001300 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001301
Guido van Rossumab5ca152000-03-31 00:52:27 +00001302 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001303 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001304 static char *tabforms[] = {
1305 "tab-width:", /* Emacs */
1306 ":tabstop=", /* vim, full form */
1307 ":ts=", /* vim, abbreviated form */
1308 "set tabsize=", /* will vi never die? */
1309 /* more templates can be added here to support other editors */
1310 };
1311 char cbuf[80];
1312 char *tp, **cp;
1313 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001315 *tp++ = c = tok_nextc(tok);
1316 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001317 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001318 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001319 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001320 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1321 cp++) {
1322 if ((tp = strstr(cbuf, *cp))) {
1323 int newsize = atoi(tp + strlen(*cp));
1324
1325 if (newsize >= 1 && newsize <= 40) {
1326 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001327 if (Py_VerboseFlag)
1328 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001329 "Tab size set to %d\n",
1330 newsize);
1331 }
1332 }
1333 }
1334 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001337
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001339 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001340 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001341 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001342
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001343 /* Identifier (most frequent token!) */
1344 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001345 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001346 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001347 case 'b':
1348 case 'B':
1349 c = tok_nextc(tok);
1350 if (c == 'r' || c == 'R')
1351 c = tok_nextc(tok);
1352 if (c == '"' || c == '\'')
1353 goto letter_quote;
1354 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001355 case 'r':
1356 case 'R':
1357 c = tok_nextc(tok);
1358 if (c == '"' || c == '\'')
1359 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001360 break;
1361 case 'u':
1362 case 'U':
1363 c = tok_nextc(tok);
1364 if (c == 'r' || c == 'R')
1365 c = tok_nextc(tok);
1366 if (c == '"' || c == '\'')
1367 goto letter_quote;
1368 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001369 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001370 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001372 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001373 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001374 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 *p_end = tok->cur;
1376 return NAME;
1377 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001378
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 /* Newline */
1380 if (c == '\n') {
1381 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001382 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001383 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001386 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 return NEWLINE;
1388 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001389
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001390 /* Period or number starting with period? */
1391 if (c == '.') {
1392 c = tok_nextc(tok);
1393 if (isdigit(c)) {
1394 goto fraction;
1395 }
1396 else {
1397 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001399 *p_end = tok->cur;
1400 return DOT;
1401 }
1402 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001403
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 /* Number */
1405 if (isdigit(c)) {
1406 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001407 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001408 c = tok_nextc(tok);
1409 if (c == '.')
1410 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001411#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001412 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001413 goto imaginary;
1414#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001416
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001417 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001418 c = tok_nextc(tok);
1419 if (!isxdigit(c)) {
1420 tok->done = E_TOKEN;
1421 tok_backup(tok, c);
1422 return ERRORTOKEN;
1423 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 do {
1425 c = tok_nextc(tok);
1426 } while (isxdigit(c));
1427 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001428 else if (c == 'o' || c == 'O') {
1429 /* Octal */
1430 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001431 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001432 tok->done = E_TOKEN;
1433 tok_backup(tok, c);
1434 return ERRORTOKEN;
1435 }
1436 do {
1437 c = tok_nextc(tok);
1438 } while ('0' <= c && c < '8');
1439 }
1440 else if (c == 'b' || c == 'B') {
1441 /* Binary */
1442 c = tok_nextc(tok);
1443 if (c != '0' && c != '1') {
1444 tok->done = E_TOKEN;
1445 tok_backup(tok, c);
1446 return ERRORTOKEN;
1447 }
1448 do {
1449 c = tok_nextc(tok);
1450 } while (c == '0' || c == '1');
1451 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001452 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001453 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001454 /* Octal; c is first char of it */
1455 /* There's no 'isoctdigit' macro, sigh */
1456 while ('0' <= c && c < '8') {
1457 c = tok_nextc(tok);
1458 }
Tim Petersd507dab2001-08-30 20:51:59 +00001459 if (isdigit(c)) {
1460 found_decimal = 1;
1461 do {
1462 c = tok_nextc(tok);
1463 } while (isdigit(c));
1464 }
1465 if (c == '.')
1466 goto fraction;
1467 else if (c == 'e' || c == 'E')
1468 goto exponent;
1469#ifndef WITHOUT_COMPLEX
1470 else if (c == 'j' || c == 'J')
1471 goto imaginary;
1472#endif
1473 else if (found_decimal) {
1474 tok->done = E_TOKEN;
1475 tok_backup(tok, c);
1476 return ERRORTOKEN;
1477 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001479 if (c == 'l' || c == 'L')
1480 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 }
1482 else {
1483 /* Decimal */
1484 do {
1485 c = tok_nextc(tok);
1486 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001487 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001488 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001489 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001490 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001491 if (c == '.') {
1492 fraction:
1493 /* Fraction */
1494 do {
1495 c = tok_nextc(tok);
1496 } while (isdigit(c));
1497 }
1498 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001499 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001500 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001501 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001502 if (c == '+' || c == '-')
1503 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001504 if (!isdigit(c)) {
1505 tok->done = E_TOKEN;
1506 tok_backup(tok, c);
1507 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001508 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001509 do {
1510 c = tok_nextc(tok);
1511 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001512 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001513#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001514 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001515 /* Imaginary part */
1516 imaginary:
1517 c = tok_nextc(tok);
1518#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001519 }
1520 }
1521 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001522 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523 *p_end = tok->cur;
1524 return NUMBER;
1525 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001526
1527 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001528 /* String */
1529 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001530 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001531 int quote = c;
1532 int triple = 0;
1533 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001534 for (;;) {
1535 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001536 if (c == '\n') {
1537 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001538 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001539 tok_backup(tok, c);
1540 return ERRORTOKEN;
1541 }
1542 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001543 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001544 }
1545 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001546 if (triple)
1547 tok->done = E_EOFS;
1548 else
1549 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001550 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001551 return ERRORTOKEN;
1552 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001553 else if (c == quote) {
1554 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001555 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001556 c = tok_nextc(tok);
1557 if (c == quote) {
1558 triple = 1;
1559 tripcount = 0;
1560 continue;
1561 }
1562 tok_backup(tok, c);
1563 }
1564 if (!triple || tripcount == 3)
1565 break;
1566 }
1567 else if (c == '\\') {
1568 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001569 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001570 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001571 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001572 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001573 return ERRORTOKEN;
1574 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001575 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001576 else
1577 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001578 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001579 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001580 *p_end = tok->cur;
1581 return STRING;
1582 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001583
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001584 /* Line continuation */
1585 if (c == '\\') {
1586 c = tok_nextc(tok);
1587 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001588 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001589 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001590 return ERRORTOKEN;
1591 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001592 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001593 goto again; /* Read next line */
1594 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001595
Guido van Rossumfbab9051991-10-20 20:25:03 +00001596 /* Check for two-character token */
1597 {
1598 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001599 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001600#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001601 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001602 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001603 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001604 tok->filename, tok->lineno,
1605 NULL, NULL)) {
1606 return ERRORTOKEN;
1607 }
1608 }
1609#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001610 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001611 int c3 = tok_nextc(tok);
1612 int token3 = PyToken_ThreeChars(c, c2, c3);
1613 if (token3 != OP) {
1614 token = token3;
1615 } else {
1616 tok_backup(tok, c3);
1617 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001618 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001619 *p_end = tok->cur;
1620 return token;
1621 }
1622 tok_backup(tok, c2);
1623 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001624
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001625 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001626 switch (c) {
1627 case '(':
1628 case '[':
1629 case '{':
1630 tok->level++;
1631 break;
1632 case ')':
1633 case ']':
1634 case '}':
1635 tok->level--;
1636 break;
1637 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001638
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001639 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001640 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001641 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001642 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001643}
1644
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001645int
1646PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1647{
1648 int result = tok_get(tok, p_start, p_end);
1649 if (tok->decoding_erred) {
1650 result = ERRORTOKEN;
1651 tok->done = E_DECODE;
1652 }
1653 return result;
1654}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655
Martin v. Löwisa5136192007-09-04 14:19:28 +00001656/* This function is only called from parsetok. However, it cannot live
1657 there, as it must be empty for PGEN, and we can check for PGEN only
1658 in this file. */
1659
Christian Heimes082c9b02008-01-23 14:20:50 +00001660#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub variant, compiled when PGEN is defined or Py_USING_UNICODE is not.
   All arguments are ignored and *offset is left untouched; the function
   always returns NULL. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    /* Nothing is ever re-encoded in this configuration. */
    return NULL;
}
1666#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001667#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001668static PyObject *
1669dec_utf8(const char *enc, const char *text, size_t len) {
1670 PyObject *ret = NULL;
1671 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1672 if (unicode_text) {
1673 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1674 Py_DECREF(unicode_text);
1675 }
1676 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001677 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001678 }
1679 return ret;
1680}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001681char *
1682PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1683{
1684 char *text = NULL;
1685 if (tok->encoding) {
1686 /* convert source to original encondig */
1687 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1688 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001689 int linelen = PyString_Size(lineobj);
1690 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001691 text = PyObject_MALLOC(linelen + 1);
1692 if (text != NULL && line != NULL) {
1693 if (linelen)
1694 strncpy(text, line, linelen);
1695 text[linelen] = '\0';
1696 }
1697 Py_DECREF(lineobj);
1698
1699 /* adjust error offset */
1700 if (*offset > 1) {
1701 PyObject *offsetobj = dec_utf8(tok->encoding,
1702 tok->buf, *offset-1);
1703 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001704 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001705 Py_DECREF(offsetobj);
1706 }
1707 }
1708
1709 }
1710 }
1711 return text;
1712
1713}
Georg Brandl76b30d12008-01-07 18:41:34 +00001714#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001715#endif
1716
Martin v. Löwisa5136192007-09-04 14:19:28 +00001717
Guido van Rossum408027e1996-12-30 16:17:54 +00001718#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001719
1720void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001721tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001722{
Guido van Rossum86bea461997-04-29 21:03:06 +00001723 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001724 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1725 printf("(%.*s)", (int)(end - start), start);
1726}
1727
1728#endif