blob: 495182fc8611f380a8edad2345dffbbc6a9a09b3 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names.
   Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* Generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000095/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000098tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000099{
Anthony Baxter11490022006-04-11 05:39:14 +0000100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000102 if (tok == NULL)
103 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000105 tok->done = E_OK;
106 tok->fp = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000107 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000115 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000124 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000125 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000126#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000129#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000130 return tok;
131}
132
Benjamin Petersone36199b2009-11-12 23:39:44 +0000133static char *
134new_string(const char *s, Py_ssize_t len)
135{
136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
142}
143
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144#ifdef PGEN
145
146static char *
147decoding_fgets(char *s, int size, struct tok_state *tok)
148{
149 return fgets(s, size, tok->fp);
150}
151
152static int
153decoding_feof(struct tok_state *tok)
154{
155 return feof(tok->fp);
156}
157
Benjamin Petersone36199b2009-11-12 23:39:44 +0000158static char *
159decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160{
Benjamin Petersone36199b2009-11-12 23:39:44 +0000161 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162}
163
164#else /* PGEN */
165
166static char *
167error_ret(struct tok_state *tok) /* XXX */
168{
169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000171 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000172 tok->buf = NULL;
173 return NULL; /* as if it were EOF */
174}
175
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176
/* Normalize the spelling of the two encodings the tokenizer special-cases
   (utf-8 and latin-1).

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"), and S itself otherwise.  Callers can therefore tell
   whether a replacement happened by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char
           (any byte >= 0x80 where char is signed) to tolower() is
           undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
208static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000209get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000210{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000211 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
230
231 begin = t;
Benjamin Peterson88623d72010-04-03 23:03:35 +0000232 while (Py_ISALNUM(t[0]) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000233 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000234 t++;
235
236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000240 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000241 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
248}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
255static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 int set_readline(struct tok_state *, const char *))
258{
Tim Peters17db21f2002-09-03 15:39:58 +0000259 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000261
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000262 if (tok->cont_line)
263 /* It's a continuation line, so it can't be a coding spec. */
264 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000265 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000266 if (cs != NULL) {
267 tok->read_coding_spec = 1;
268 if (tok->encoding == NULL) {
269 assert(tok->decoding_state == 1); /* raw */
270 if (strcmp(cs, "utf-8") == 0 ||
271 strcmp(cs, "iso-8859-1") == 0) {
272 tok->encoding = cs;
273 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000274#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000281 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000282#else
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
286 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000287 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000288#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000289 }
290 } else { /* then, compare cs with BOM */
291 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000295 if (!r) {
296 cs = tok->encoding;
297 if (!cs)
298 cs = "with BOM";
299 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000301 return r;
302}
303
304/* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
307
308static int
309check_bom(int get_char(struct tok_state *),
310 void unget_char(int, struct tok_state *),
311 int set_readline(struct tok_state *, const char *),
312 struct tok_state *tok)
313{
Victor Stinnerd23d3932010-03-02 23:20:02 +0000314 int ch1, ch2, ch3;
315 ch1 = get_char(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000316 tok->decoding_state = 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000317 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318 return 1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000319 } else if (ch1 == 0xEF) {
320 ch2 = get_char(tok);
321 if (ch2 != 0xBB) {
322 unget_char(ch2, tok);
323 unget_char(ch1, tok);
324 return 1;
325 }
326 ch3 = get_char(tok);
327 if (ch3 != 0xBF) {
328 unget_char(ch3, tok);
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
331 return 1;
332 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#if 0
334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
Victor Stinnerd23d3932010-03-02 23:20:02 +0000336 } else if (ch1 == 0xFE) {
337 ch2 = get_char(tok);
338 if (ch2 != 0xFF) {
339 unget_char(ch2, tok);
340 unget_char(ch1, tok);
341 return 1;
342 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000343 if (!set_readline(tok, "utf-16-be"))
344 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000345 tok->decoding_state = -1;
Victor Stinnerd23d3932010-03-02 23:20:02 +0000346 } else if (ch1 == 0xFF) {
347 ch2 = get_char(tok);
348 if (ch2 != 0xFE) {
349 unget_char(ch2, tok);
350 unget_char(ch1, tok);
351 return 1;
352 }
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000353 if (!set_readline(tok, "utf-16-le"))
354 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000355 tok->decoding_state = -1;
356#endif
357 } else {
Victor Stinnerd23d3932010-03-02 23:20:02 +0000358 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000359 return 1;
360 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000361 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000362 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
364 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365}
366
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE - 1 bytes are stored, followed by a NUL.
   Return NULL on failure or at EOF (an empty line was read), else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: a previous call to fp_readl did not have enough
        room (in the s buffer) to copy the entire contents of the line
        read by tok->decoding_readline.  tok->decoding_buffer has the
        overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the stored object; an exact string is the
           already-encoded overflow of an earlier call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
432
433/* Set the readline function for TOK to a StreamReader's
434 readline function. The StreamReader is named ENC.
435
436 This function is called from check_bom and check_coding_spec.
437
438 ENC is usually identical to the future value of tok->encoding,
439 except for the (currently unsupported) case of UTF-16.
440
441 Return 1 on success, 0 on failure. */
442
443static int
444fp_setreadl(struct tok_state *tok, const char* enc)
445{
446 PyObject *reader, *stream, *readline;
447
Martin v. Löwis95292d62002-12-11 14:04:59 +0000448 /* XXX: constify filename argument. */
449 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000450 if (stream == NULL)
451 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000452
453 reader = PyCodec_StreamReader(enc, stream, NULL);
454 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000455 if (reader == NULL)
456 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000457
458 readline = PyObject_GetAttrString(reader, "readline");
459 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000460 if (readline == NULL)
461 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462
463 tok->decoding_readline = readline;
464 return 1;
465}
466
467/* Fetch the next byte from TOK. */
468
469static int fp_getc(struct tok_state *tok) {
470 return getc(tok->fp);
471}
472
473/* Unfetch the last byte back into TOK. */
474
475static void fp_ungetc(int c, struct tok_state *tok) {
476 ungetc(c, tok->fp);
477}
478
479/* Read a line of input from TOK. Determine encoding
480 if necessary. */
481
482static char *
483decoding_fgets(char *s, int size, struct tok_state *tok)
484{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000485 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000486 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000487 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 if (tok->decoding_state < 0) {
489 /* We already have a codec associated with
490 this input. */
491 line = fp_readl(s, size, tok);
492 break;
493 } else if (tok->decoding_state > 0) {
494 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000495 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000497 break;
498 } else {
499 /* We have not yet determined the encoding.
500 If an encoding is found, use the file-pointer
501 reader functions from now on. */
502 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
503 return error_ret(tok);
504 assert(tok->decoding_state != 0);
505 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000506 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
508 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
509 return error_ret(tok);
510 }
511 }
512#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000513 /* The default encoding is ASCII, so make sure we don't have any
514 non-ASCII bytes in it. */
515 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000517 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000518 if (*c > 127) {
519 badchar = *c;
520 break;
521 }
522 }
523 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000524 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000525 /* Need to add 1 to the line number, since this line
526 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000527 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000528 "Non-ASCII character '\\x%.2x' "
529 "in file %.200s on line %i, "
530 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000531 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000532 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000533 PyErr_SetString(PyExc_SyntaxError, buf);
534 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535 }
536#endif
537 return line;
538}
539
540static int
541decoding_feof(struct tok_state *tok)
542{
543 if (tok->decoding_state >= 0) {
544 return feof(tok->fp);
545 } else {
546 PyObject* buf = tok->decoding_buffer;
547 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000548 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549 if (buf == NULL) {
550 error_ret(tok);
551 return 1;
552 } else {
553 tok->decoding_buffer = buf;
554 }
555 }
556 return PyObject_Length(buf) == 0;
557 }
558}
559
560/* Fetch a byte from TOK, using the string buffer. */
561
Tim Petersc9d78aa2006-03-26 23:27:58 +0000562static int
563buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000564 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565}
566
567/* Unfetch a byte from TOK, using the string buffer. */
568
Tim Petersc9d78aa2006-03-26 23:27:58 +0000569static void
570buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000572 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573}
574
575/* Set the readline function for TOK to ENC. For the string-based
576 tokenizer, this means to just record the encoding. */
577
Tim Petersc9d78aa2006-03-26 23:27:58 +0000578static int
579buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 tok->enc = enc;
581 return 1;
582}
583
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */
586
Martin v. Löwis019934b2002-08-07 12:33:18 +0000587#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588static PyObject *
589translate_into_utf8(const char* str, const char* enc) {
590 PyObject *utf8;
591 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
592 if (buf == NULL)
593 return NULL;
594 utf8 = PyUnicode_AsUTF8String(buf);
595 Py_DECREF(buf);
596 return utf8;
597}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000598#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599
Benjamin Petersone36199b2009-11-12 23:39:44 +0000600
601static char *
602translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000603 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000604 char *buf, *current;
Benjamin Peterson42d63842009-12-06 17:37:48 +0000605 char c = '\0';
606 buf = PyMem_MALLOC(needed_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000607 if (buf == NULL) {
608 tok->done = E_NOMEM;
609 return NULL;
610 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000611 for (current = buf; *s; s++, current++) {
612 c = *s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000613 if (skip_next_lf) {
614 skip_next_lf = 0;
615 if (c == '\n') {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000616 c = *++s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000617 if (!c)
618 break;
619 }
620 }
621 if (c == '\r') {
622 skip_next_lf = 1;
623 c = '\n';
624 }
625 *current = c;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000626 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000627 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersone36199b2009-11-12 23:39:44 +0000628 there isn't one already. */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000629 if (exec_input && c != '\n') {
Benjamin Petersone36199b2009-11-12 23:39:44 +0000630 *current = '\n';
631 current++;
632 }
633 *current = '\0';
Benjamin Peterson42d63842009-12-06 17:37:48 +0000634 final_length = current - buf + 1;
635 if (final_length < needed_length && final_length)
Benjamin Petersone36199b2009-11-12 23:39:44 +0000636 /* should never fail */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000637 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000638 return buf;
639}
640
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641/* Decode a byte string STR for use as the buffer of TOK.
642 Look for encoding declarations inside STR, and record them
643 inside TOK. */
644
645static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000646decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000647{
648 PyObject* utf8 = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000649 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000651 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652 int lineno = 0;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000653 tok->input = str = translate_newlines(input, single, tok);
654 if (str == NULL)
655 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656 tok->enc = NULL;
657 tok->str = str;
658 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000659 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000661 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000662#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000663 if (tok->enc != NULL) {
664 utf8 = translate_into_utf8(str, tok->enc);
665 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000667 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000669#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670 for (s = str;; s++) {
671 if (*s == '\0') break;
672 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000673 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000674 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000675 lineno++;
676 if (lineno == 2) break;
677 }
678 }
679 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000680 /* need to check line 1 and 2 separately since check_coding_spec
681 assumes a single line as input */
682 if (newl[0]) {
683 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
684 return error_ret(tok);
685 if (tok->enc == NULL && newl[1]) {
686 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
687 tok, buf_setreadl))
688 return error_ret(tok);
689 }
690 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000691#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000692 if (tok->enc != NULL) {
693 assert(utf8 == NULL);
694 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000695 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000696 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000697 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000699#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700 assert(tok->decoding_buffer == NULL);
701 tok->decoding_buffer = utf8; /* CAUTION */
702 return str;
703}
704
705#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706
707/* Set up tokenizer for string */
708
709struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000710PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711{
712 struct tok_state *tok = tok_new();
713 if (tok == NULL)
714 return NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000715 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716 if (str == NULL) {
717 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000718 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000719 }
720
Martin v. Löwis95292d62002-12-11 14:04:59 +0000721 /* XXX: constify members. */
722 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723 return tok;
724}
725
726
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000727/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000728
729struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000730PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731{
732 struct tok_state *tok = tok_new();
733 if (tok == NULL)
734 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000735 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000736 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 return NULL;
738 }
739 tok->cur = tok->inp = tok->buf;
740 tok->end = tok->buf + BUFSIZ;
741 tok->fp = fp;
742 tok->prompt = ps1;
743 tok->nextprompt = ps2;
744 return tok;
745}
746
747
748/* Free a tok_state structure */
749
750void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000751PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000753 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000754 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000755#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000756 Py_XDECREF(tok->decoding_readline);
757 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000758#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000760 PyMem_FREE(tok->buf);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000761 if (tok->input)
762 PyMem_FREE((char *)tok->input);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000763 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764}
765
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000766#if !defined(PGEN) && defined(Py_USING_UNICODE)
767static int
768tok_stdin_decode(struct tok_state *tok, char **inp)
769{
770 PyObject *enc, *sysstdin, *decoded, *utf8;
771 const char *encoding;
772 char *converted;
773
774 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
775 return 0;
776 sysstdin = PySys_GetObject("stdin");
777 if (sysstdin == NULL || !PyFile_Check(sysstdin))
778 return 0;
779
780 enc = ((PyFileObject *)sysstdin)->f_encoding;
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000781 if (enc == NULL || !PyString_Check(enc))
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000782 return 0;
783 Py_INCREF(enc);
784
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000785 encoding = PyString_AsString(enc);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000786 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
787 if (decoded == NULL)
788 goto error_clear;
789
790 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
791 Py_DECREF(decoded);
792 if (utf8 == NULL)
793 goto error_clear;
794
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000795 assert(PyString_Check(utf8));
796 converted = new_string(PyString_AS_STRING(utf8),
797 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000798 Py_DECREF(utf8);
799 if (converted == NULL)
800 goto error_nomem;
801
Neal Norwitz08062d62006-04-11 08:19:15 +0000802 PyMem_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000803 *inp = converted;
804 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000805 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000806 tok->encoding = new_string(encoding, strlen(encoding));
807 if (tok->encoding == NULL)
808 goto error_nomem;
809
810 Py_DECREF(enc);
811 return 0;
812
813error_nomem:
814 Py_DECREF(enc);
815 tok->done = E_NOMEM;
816 return -1;
817
818error_clear:
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000819 Py_DECREF(enc);
Victor Stinner66644262010-03-10 22:30:19 +0000820 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
821 tok->done = E_ERROR;
822 return -1;
823 }
824 /* Fallback to iso-8859-1: for backward compatibility */
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000825 PyErr_Clear();
826 return 0;
827}
828#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829
830/* Get next char, updating state; error code goes into tok->done */
831
832static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000833tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000834{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000836 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000837 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000838 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000839 if (tok->done != E_OK)
840 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000841 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000842 char *end = strchr(tok->inp, '\n');
843 if (end != NULL)
844 end++;
845 else {
846 end = strchr(tok->inp, '\0');
847 if (end == tok->inp) {
848 tok->done = E_EOF;
849 return EOF;
850 }
851 }
852 if (tok->start == NULL)
853 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000854 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000855 tok->lineno++;
856 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000857 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000860 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861 if (tok->nextprompt != NULL)
862 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000863 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000865 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000866 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000867 tok->done = E_EOF;
868 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000869#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000870 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000871 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000872#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000873 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000874 size_t start = tok->start - tok->buf;
875 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000876 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000877 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000878 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 tok->lineno++;
880 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000881 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000882 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000883 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000884 tok->done = E_NOMEM;
885 return EOF;
886 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000887 tok->buf = buf;
888 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000889 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000890 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000891 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 tok->inp = tok->buf + newlen;
893 tok->end = tok->inp + 1;
894 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000896 else {
897 tok->lineno++;
898 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000899 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000900 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000901 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000902 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000903 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 tok->inp = strchr(tok->buf, '\0');
905 tok->end = tok->inp + 1;
906 }
907 }
908 else {
909 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000910 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000911 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000912 if (tok->start == NULL) {
913 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000914 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000915 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000916 if (tok->buf == NULL) {
917 tok->done = E_NOMEM;
918 return EOF;
919 }
920 tok->end = tok->buf + BUFSIZ;
921 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000922 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
923 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000924 tok->done = E_EOF;
925 done = 1;
926 }
927 else {
928 tok->done = E_OK;
929 tok->inp = strchr(tok->buf, '\0');
930 done = tok->inp[-1] == '\n';
931 }
932 }
933 else {
934 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000935 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000936 tok->done = E_EOF;
937 done = 1;
938 }
939 else
940 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000941 }
942 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000943 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000944 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000945 Py_ssize_t curstart = tok->start == NULL ? -1 :
946 tok->start - tok->buf;
947 Py_ssize_t curvalid = tok->inp - tok->buf;
948 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000949 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000950 newbuf = (char *)PyMem_REALLOC(newbuf,
951 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000952 if (newbuf == NULL) {
953 tok->done = E_NOMEM;
954 tok->cur = tok->inp;
955 return EOF;
956 }
957 tok->buf = newbuf;
958 tok->inp = tok->buf + curvalid;
959 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000960 tok->start = curstart < 0 ? NULL :
961 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000962 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000963 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000964 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000965 /* Break out early on decoding
966 errors, as tok->buf will be NULL
967 */
968 if (tok->decoding_erred)
969 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000970 /* Last line does not end in \n,
971 fake one */
972 strcpy(tok->inp, "\n");
973 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000974 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000975 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000976 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000977 if (tok->buf != NULL) {
978 tok->cur = tok->buf + cur;
979 tok->line_start = tok->cur;
980 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000981 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000982 pt = tok->inp - 2;
983 if (pt >= tok->buf && *pt == '\r') {
984 *pt++ = '\n';
985 *pt = '\0';
986 tok->inp = pt;
987 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000988 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000989 }
990 if (tok->done != E_OK) {
991 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000992 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000993 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000994 return EOF;
995 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000996 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000997 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000998}
999
1000
1001/* Back-up one character */
1002
1003static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001004tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001005{
1006 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001007 if (--tok->cur < tok->buf)
Benjamin Petersone3383b82009-11-07 01:04:38 +00001008 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001009 if (*tok->cur != c)
1010 *tok->cur = c;
1011 }
1012}
1013
1014
1015/* Return the token corresponding to a single character */
1016
1017int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001018PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001019{
1020 switch (c) {
1021 case '(': return LPAR;
1022 case ')': return RPAR;
1023 case '[': return LSQB;
1024 case ']': return RSQB;
1025 case ':': return COLON;
1026 case ',': return COMMA;
1027 case ';': return SEMI;
1028 case '+': return PLUS;
1029 case '-': return MINUS;
1030 case '*': return STAR;
1031 case '/': return SLASH;
1032 case '|': return VBAR;
1033 case '&': return AMPER;
1034 case '<': return LESS;
1035 case '>': return GREATER;
1036 case '=': return EQUAL;
1037 case '.': return DOT;
1038 case '%': return PERCENT;
1039 case '`': return BACKQUOTE;
1040 case '{': return LBRACE;
1041 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001042 case '^': return CIRCUMFLEX;
1043 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001044 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001045 default: return OP;
1046 }
1047}
1048
1049
Guido van Rossumfbab9051991-10-20 20:25:03 +00001050int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001051PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001052{
1053 switch (c1) {
1054 case '=':
1055 switch (c2) {
1056 case '=': return EQEQUAL;
1057 }
1058 break;
1059 case '!':
1060 switch (c2) {
1061 case '=': return NOTEQUAL;
1062 }
1063 break;
1064 case '<':
1065 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001066 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001067 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001068 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001069 }
1070 break;
1071 case '>':
1072 switch (c2) {
1073 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001074 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001075 }
1076 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 case '+':
1078 switch (c2) {
1079 case '=': return PLUSEQUAL;
1080 }
1081 break;
1082 case '-':
1083 switch (c2) {
1084 case '=': return MINEQUAL;
1085 }
1086 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001087 case '*':
1088 switch (c2) {
1089 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001090 case '=': return STAREQUAL;
1091 }
1092 break;
1093 case '/':
1094 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001095 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001096 case '=': return SLASHEQUAL;
1097 }
1098 break;
1099 case '|':
1100 switch (c2) {
1101 case '=': return VBAREQUAL;
1102 }
1103 break;
1104 case '%':
1105 switch (c2) {
1106 case '=': return PERCENTEQUAL;
1107 }
1108 break;
1109 case '&':
1110 switch (c2) {
1111 case '=': return AMPEREQUAL;
1112 }
1113 break;
1114 case '^':
1115 switch (c2) {
1116 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001117 }
1118 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001119 }
1120 return OP;
1121}
1122
Thomas Wouters434d0822000-08-24 20:11:32 +00001123int
1124PyToken_ThreeChars(int c1, int c2, int c3)
1125{
1126 switch (c1) {
1127 case '<':
1128 switch (c2) {
1129 case '<':
1130 switch (c3) {
1131 case '=':
1132 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001133 }
1134 break;
1135 }
1136 break;
1137 case '>':
1138 switch (c2) {
1139 case '>':
1140 switch (c3) {
1141 case '=':
1142 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001143 }
1144 break;
1145 }
1146 break;
1147 case '*':
1148 switch (c2) {
1149 case '*':
1150 switch (c3) {
1151 case '=':
1152 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001153 }
1154 break;
1155 }
1156 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001157 case '/':
1158 switch (c2) {
1159 case '/':
1160 switch (c3) {
1161 case '=':
1162 return DOUBLESLASHEQUAL;
1163 }
1164 break;
1165 }
1166 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001167 }
1168 return OP;
1169}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001170
Guido van Rossum926f13a1998-04-09 21:38:06 +00001171static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001172indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001173{
1174 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001175 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001176 tok->cur = tok->inp;
1177 return 1;
1178 }
1179 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001180 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1181 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001182 tok->altwarning = 0;
1183 }
1184 return 0;
1185}
1186
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187/* Get next token, after space stripping etc. */
1188
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001189static int
1190tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191{
1192 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001193 int blankline;
1194
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001195 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001196 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001197 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001198 blankline = 0;
1199
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200 /* Get indentation level */
1201 if (tok->atbol) {
1202 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001203 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 for (;;) {
1206 c = tok_nextc(tok);
1207 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001208 col++, altcol++;
1209 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001210 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001211 altcol = (altcol/tok->alttabsize + 1)
1212 * tok->alttabsize;
1213 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001214 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001215 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216 else
1217 break;
1218 }
1219 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001220 if (c == '#' || c == '\n') {
1221 /* Lines with only whitespace and/or comments
1222 shouldn't affect the indentation and are
1223 not passed to the parser as NEWLINE tokens,
1224 except *totally* empty lines in interactive
1225 mode, which signal the end of a command group. */
1226 if (col == 0 && c == '\n' && tok->prompt != NULL)
1227 blankline = 0; /* Let it through */
1228 else
1229 blankline = 1; /* Ignore completely */
1230 /* We can't jump back right here since we still
1231 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001233 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001234 if (col == tok->indstack[tok->indent]) {
1235 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001236 if (altcol != tok->altindstack[tok->indent]) {
1237 if (indenterror(tok))
1238 return ERRORTOKEN;
1239 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001241 else if (col > tok->indstack[tok->indent]) {
1242 /* Indent -- always one */
1243 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001244 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001245 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001246 return ERRORTOKEN;
1247 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001248 if (altcol <= tok->altindstack[tok->indent]) {
1249 if (indenterror(tok))
1250 return ERRORTOKEN;
1251 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001252 tok->pendin++;
1253 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001254 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001256 else /* col < tok->indstack[tok->indent] */ {
1257 /* Dedent -- any number, must be consistent */
1258 while (tok->indent > 0 &&
1259 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001260 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001261 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001262 }
1263 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001264 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001265 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001266 return ERRORTOKEN;
1267 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001268 if (altcol != tok->altindstack[tok->indent]) {
1269 if (indenterror(tok))
1270 return ERRORTOKEN;
1271 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 }
1273 }
1274 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001275
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001276 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001277
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 /* Return pending indents/dedents */
1279 if (tok->pendin != 0) {
1280 if (tok->pendin < 0) {
1281 tok->pendin++;
1282 return DEDENT;
1283 }
1284 else {
1285 tok->pendin--;
1286 return INDENT;
1287 }
1288 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001291 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 /* Skip spaces */
1293 do {
1294 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001295 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001296
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001297 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001298 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001299
Guido van Rossumab5ca152000-03-31 00:52:27 +00001300 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001302 static char *tabforms[] = {
1303 "tab-width:", /* Emacs */
1304 ":tabstop=", /* vim, full form */
1305 ":ts=", /* vim, abbreviated form */
1306 "set tabsize=", /* will vi never die? */
1307 /* more templates can be added here to support other editors */
1308 };
1309 char cbuf[80];
1310 char *tp, **cp;
1311 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001312 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001313 *tp++ = c = tok_nextc(tok);
1314 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001315 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001316 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001317 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001318 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1319 cp++) {
1320 if ((tp = strstr(cbuf, *cp))) {
1321 int newsize = atoi(tp + strlen(*cp));
1322
1323 if (newsize >= 1 && newsize <= 40) {
1324 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001325 if (Py_VerboseFlag)
1326 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001327 "Tab size set to %d\n",
1328 newsize);
1329 }
1330 }
1331 }
1332 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001335
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001337 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001339 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001340
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 /* Identifier (most frequent token!) */
Benjamin Peterson88623d72010-04-03 23:03:35 +00001342 if (Py_ISALPHA(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001343 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001344 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001345 case 'b':
1346 case 'B':
1347 c = tok_nextc(tok);
1348 if (c == 'r' || c == 'R')
1349 c = tok_nextc(tok);
1350 if (c == '"' || c == '\'')
1351 goto letter_quote;
1352 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001353 case 'r':
1354 case 'R':
1355 c = tok_nextc(tok);
1356 if (c == '"' || c == '\'')
1357 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001358 break;
1359 case 'u':
1360 case 'U':
1361 c = tok_nextc(tok);
1362 if (c == 'r' || c == 'R')
1363 c = tok_nextc(tok);
1364 if (c == '"' || c == '\'')
1365 goto letter_quote;
1366 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001367 }
Benjamin Peterson88623d72010-04-03 23:03:35 +00001368 while (Py_ISALNUM(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001370 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001372 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001373 *p_end = tok->cur;
1374 return NAME;
1375 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001376
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001377 /* Newline */
1378 if (c == '\n') {
1379 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001380 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001381 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001382 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001384 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 return NEWLINE;
1386 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001387
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001388 /* Period or number starting with period? */
1389 if (c == '.') {
1390 c = tok_nextc(tok);
1391 if (isdigit(c)) {
1392 goto fraction;
1393 }
1394 else {
1395 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001396 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001397 *p_end = tok->cur;
1398 return DOT;
1399 }
1400 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001401
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 /* Number */
1403 if (isdigit(c)) {
1404 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001405 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001406 c = tok_nextc(tok);
1407 if (c == '.')
1408 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001409#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001410 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001411 goto imaginary;
1412#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001414
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001416 c = tok_nextc(tok);
1417 if (!isxdigit(c)) {
1418 tok->done = E_TOKEN;
1419 tok_backup(tok, c);
1420 return ERRORTOKEN;
1421 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 do {
1423 c = tok_nextc(tok);
1424 } while (isxdigit(c));
1425 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001426 else if (c == 'o' || c == 'O') {
1427 /* Octal */
1428 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001429 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001430 tok->done = E_TOKEN;
1431 tok_backup(tok, c);
1432 return ERRORTOKEN;
1433 }
1434 do {
1435 c = tok_nextc(tok);
1436 } while ('0' <= c && c < '8');
1437 }
1438 else if (c == 'b' || c == 'B') {
1439 /* Binary */
1440 c = tok_nextc(tok);
1441 if (c != '0' && c != '1') {
1442 tok->done = E_TOKEN;
1443 tok_backup(tok, c);
1444 return ERRORTOKEN;
1445 }
1446 do {
1447 c = tok_nextc(tok);
1448 } while (c == '0' || c == '1');
1449 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001450 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001451 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001452 /* Octal; c is first char of it */
1453 /* There's no 'isoctdigit' macro, sigh */
1454 while ('0' <= c && c < '8') {
1455 c = tok_nextc(tok);
1456 }
Tim Petersd507dab2001-08-30 20:51:59 +00001457 if (isdigit(c)) {
1458 found_decimal = 1;
1459 do {
1460 c = tok_nextc(tok);
1461 } while (isdigit(c));
1462 }
1463 if (c == '.')
1464 goto fraction;
1465 else if (c == 'e' || c == 'E')
1466 goto exponent;
1467#ifndef WITHOUT_COMPLEX
1468 else if (c == 'j' || c == 'J')
1469 goto imaginary;
1470#endif
1471 else if (found_decimal) {
1472 tok->done = E_TOKEN;
1473 tok_backup(tok, c);
1474 return ERRORTOKEN;
1475 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001477 if (c == 'l' || c == 'L')
1478 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479 }
1480 else {
1481 /* Decimal */
1482 do {
1483 c = tok_nextc(tok);
1484 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001485 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001486 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001487 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001488 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001489 if (c == '.') {
1490 fraction:
1491 /* Fraction */
1492 do {
1493 c = tok_nextc(tok);
1494 } while (isdigit(c));
1495 }
1496 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001497 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001498 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001499 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001500 if (c == '+' || c == '-')
1501 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001502 if (!isdigit(c)) {
1503 tok->done = E_TOKEN;
1504 tok_backup(tok, c);
1505 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001506 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001507 do {
1508 c = tok_nextc(tok);
1509 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001510 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001511#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001512 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001513 /* Imaginary part */
1514 imaginary:
1515 c = tok_nextc(tok);
1516#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001517 }
1518 }
1519 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001520 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 *p_end = tok->cur;
1522 return NUMBER;
1523 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001524
1525 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001526 /* String */
1527 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001529 int quote = c;
1530 int triple = 0;
1531 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001532 for (;;) {
1533 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001534 if (c == '\n') {
1535 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001536 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001537 tok_backup(tok, c);
1538 return ERRORTOKEN;
1539 }
1540 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001541 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001542 }
1543 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001544 if (triple)
1545 tok->done = E_EOFS;
1546 else
1547 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001548 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001549 return ERRORTOKEN;
1550 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001551 else if (c == quote) {
1552 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001553 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001554 c = tok_nextc(tok);
1555 if (c == quote) {
1556 triple = 1;
1557 tripcount = 0;
1558 continue;
1559 }
1560 tok_backup(tok, c);
1561 }
1562 if (!triple || tripcount == 3)
1563 break;
1564 }
1565 else if (c == '\\') {
1566 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001567 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001568 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001569 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001570 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001571 return ERRORTOKEN;
1572 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001573 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001574 else
1575 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001576 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001577 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001578 *p_end = tok->cur;
1579 return STRING;
1580 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001581
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001582 /* Line continuation */
1583 if (c == '\\') {
1584 c = tok_nextc(tok);
1585 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001586 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001587 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001588 return ERRORTOKEN;
1589 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001590 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001591 goto again; /* Read next line */
1592 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001593
Guido van Rossumfbab9051991-10-20 20:25:03 +00001594 /* Check for two-character token */
1595 {
1596 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001597 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001598#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001599 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001600 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001601 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001602 tok->filename, tok->lineno,
1603 NULL, NULL)) {
1604 return ERRORTOKEN;
1605 }
1606 }
1607#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001608 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001609 int c3 = tok_nextc(tok);
1610 int token3 = PyToken_ThreeChars(c, c2, c3);
1611 if (token3 != OP) {
1612 token = token3;
1613 } else {
1614 tok_backup(tok, c3);
1615 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001616 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001617 *p_end = tok->cur;
1618 return token;
1619 }
1620 tok_backup(tok, c2);
1621 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001622
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001623 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001624 switch (c) {
1625 case '(':
1626 case '[':
1627 case '{':
1628 tok->level++;
1629 break;
1630 case ')':
1631 case ']':
1632 case '}':
1633 tok->level--;
1634 break;
1635 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001636
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001637 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001638 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001639 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001640 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001641}
1642
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001643int
1644PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1645{
1646 int result = tok_get(tok, p_start, p_end);
1647 if (tok->decoding_erred) {
1648 result = ERRORTOKEN;
1649 tok->done = E_DECODE;
1650 }
1651 return result;
1652}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001653
Martin v. Löwisa5136192007-09-04 14:19:28 +00001654/* This function is only called from parsetok. However, it cannot live
1655 there, as it must be empty for PGEN, and we can check for PGEN only
1656 in this file. */
1657
Christian Heimes082c9b02008-01-23 14:20:50 +00001658#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building PGEN or without Unicode support: nothing is
   converted.  Always returns NULL and leaves *offset untouched. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    /* All arguments are intentionally ignored in this configuration. */
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1664#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001665#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001666static PyObject *
1667dec_utf8(const char *enc, const char *text, size_t len) {
1668 PyObject *ret = NULL;
1669 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1670 if (unicode_text) {
1671 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1672 Py_DECREF(unicode_text);
1673 }
1674 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001675 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001676 }
1677 return ret;
1678}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001679char *
1680PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1681{
1682 char *text = NULL;
1683 if (tok->encoding) {
1684 /* convert source to original encondig */
1685 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1686 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001687 int linelen = PyString_Size(lineobj);
1688 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001689 text = PyObject_MALLOC(linelen + 1);
1690 if (text != NULL && line != NULL) {
1691 if (linelen)
1692 strncpy(text, line, linelen);
1693 text[linelen] = '\0';
1694 }
1695 Py_DECREF(lineobj);
1696
1697 /* adjust error offset */
1698 if (*offset > 1) {
1699 PyObject *offsetobj = dec_utf8(tok->encoding,
1700 tok->buf, *offset-1);
1701 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001702 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001703 Py_DECREF(offsetobj);
1704 }
1705 }
1706
1707 }
1708 }
1709 return text;
1710
1711}
Georg Brandl76b30d12008-01-07 18:41:34 +00001712#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001713#endif
1714
Martin v. Löwisa5136192007-09-04 14:19:28 +00001715
Guido van Rossum408027e1996-12-30 16:17:54 +00001716#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001717
1718void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001719tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001720{
Guido van Rossum86bea461997-04-29 21:03:06 +00001721 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001722 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1723 printf("(%.*s)", (int)(end - start), start);
1724}
1725
1726#endif