blob: 04749c865798e9daee43a0789461b0e7d7a5c590 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000035/* Token names */
36
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /*  0 -  6 */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /*  7 - 13 */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* 14 - 27 */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* 28 - 36 */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* 37 - 47 */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 48 - 50 */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT",
    /* 51 - 53 */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000108 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 tok->tabsize = TABSIZE;
110 tok->indent = 0;
111 tok->indstack[0] = 0;
112 tok->atbol = 1;
113 tok->pendin = 0;
114 tok->prompt = tok->nextprompt = NULL;
115 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000116 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000117 tok->filename = NULL;
118 tok->altwarning = 0;
119 tok->alterror = 0;
120 tok->alttabsize = 1;
121 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000122 tok->decoding_state = 0;
123 tok->decoding_erred = 0;
124 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000125 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000126 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000127#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000128 tok->decoding_readline = NULL;
129 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000130#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131 return tok;
132}
133
Benjamin Petersone36199b2009-11-12 23:39:44 +0000134static char *
135new_string(const char *s, Py_ssize_t len)
136{
137 char* result = (char *)PyMem_MALLOC(len + 1);
138 if (result != NULL) {
139 memcpy(result, s, len);
140 result[len] = '\0';
141 }
142 return result;
143}
144
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145#ifdef PGEN
146
147static char *
148decoding_fgets(char *s, int size, struct tok_state *tok)
149{
150 return fgets(s, size, tok->fp);
151}
152
153static int
154decoding_feof(struct tok_state *tok)
155{
156 return feof(tok->fp);
157}
158
/* PGEN build: return a fresh copy of STR unchanged.  EXEC_INPUT and
   TOK are not used here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
164
165#else /* PGEN */
166
167static char *
168error_ret(struct tok_state *tok) /* XXX */
169{
170 tok->decoding_erred = 1;
171 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000172 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173 tok->buf = NULL;
174 return NULL; /* as if it were EOF */
175}
176
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177
/* Map the common spellings of the utf-8 and latin-1 encoding names to a
   canonical form.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the string constant "utf-8" or
   "iso-8859-1" when the name (or a "<name>-" prefix) matches, otherwise
   returns S itself, so callers can detect "no change" by pointer
   comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: passing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000320 ch = get_char(tok);
321 if (ch != 0xBB)
322 goto NON_BOM;
323 ch = get_char(tok);
324 if (ch != 0xBF)
325 goto NON_BOM;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326#if 0
327 /* Disable support for UTF-16 BOMs until a decision
328 is made whether this needs to be supported. */
329 } else if (ch == 0xFE) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000330 ch = get_char(tok);
331 if (ch != 0xFF)
332 goto NON_BOM;
333 if (!set_readline(tok, "utf-16-be"))
334 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335 tok->decoding_state = -1;
336 } else if (ch == 0xFF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000337 ch = get_char(tok);
338 if (ch != 0xFE)
339 goto NON_BOM;
340 if (!set_readline(tok, "utf-16-le"))
341 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000342 tok->decoding_state = -1;
343#endif
344 } else {
345 unget_char(ch, tok);
346 return 1;
347 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000348 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000349 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000350 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
351 return 1;
352 NON_BOM:
353 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
354 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
355 return 1;
356}
357
358/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 Return NULL on failure, else S.
Tim Petersc9d78aa2006-03-26 23:27:58 +0000360
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000361 On entry, tok->decoding_buffer will be one of:
362 1) NULL: need to call tok->decoding_readline to get a new line
363 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
364 stored the result in tok->decoding_buffer
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000365 3) PyStringObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000366 (in the s buffer) to copy entire contents of the line read
367 by tok->decoding_readline. tok->decoding_buffer has the overflow.
368 In this case, fp_readl is called in a loop (with an expanded buffer)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000369 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000370 reached): see tok_nextc and its calls to decoding_fgets.
371*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    Py_ssize_t room;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over whatever an earlier call left behind.  An exact
           string object is already UTF-8 overflow from a previous line. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Does not fit: keep the tail in tok->decoding_buffer for the
           next call and copy only what there is room for. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    if (nbytes == 0)
        return NULL;            /* EOF */
    return s;
#endif
}
423
424/* Set the readline function for TOK to a StreamReader's
425 readline function. The StreamReader is named ENC.
426
427 This function is called from check_bom and check_coding_spec.
428
429 ENC is usually identical to the future value of tok->encoding,
430 except for the (currently unsupported) case of UTF-16.
431
432 Return 1 on success, 0 on failure. */
433
434static int
435fp_setreadl(struct tok_state *tok, const char* enc)
436{
437 PyObject *reader, *stream, *readline;
438
Martin v. Löwis95292d62002-12-11 14:04:59 +0000439 /* XXX: constify filename argument. */
440 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000441 if (stream == NULL)
442 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443
444 reader = PyCodec_StreamReader(enc, stream, NULL);
445 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000446 if (reader == NULL)
447 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000448
449 readline = PyObject_GetAttrString(reader, "readline");
450 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000451 if (readline == NULL)
452 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453
454 tok->decoding_readline = readline;
455 return 1;
456}
457
458/* Fetch the next byte from TOK. */
459
460static int fp_getc(struct tok_state *tok) {
461 return getc(tok->fp);
462}
463
464/* Unfetch the last byte back into TOK. */
465
466static void fp_ungetc(int c, struct tok_state *tok) {
467 ungetc(c, tok->fp);
468}
469
470/* Read a line of input from TOK. Determine encoding
471 if necessary. */
472
473static char *
474decoding_fgets(char *s, int size, struct tok_state *tok)
475{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000476 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000477 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000478 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 if (tok->decoding_state < 0) {
480 /* We already have a codec associated with
481 this input. */
482 line = fp_readl(s, size, tok);
483 break;
484 } else if (tok->decoding_state > 0) {
485 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000486 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 break;
489 } else {
490 /* We have not yet determined the encoding.
491 If an encoding is found, use the file-pointer
492 reader functions from now on. */
493 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
494 return error_ret(tok);
495 assert(tok->decoding_state != 0);
496 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000497 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
499 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
500 return error_ret(tok);
501 }
502 }
503#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000504 /* The default encoding is ASCII, so make sure we don't have any
505 non-ASCII bytes in it. */
506 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000508 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000509 if (*c > 127) {
510 badchar = *c;
511 break;
512 }
513 }
514 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000515 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000516 /* Need to add 1 to the line number, since this line
517 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000518 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000519 "Non-ASCII character '\\x%.2x' "
520 "in file %.200s on line %i, "
521 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000522 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000523 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000524 PyErr_SetString(PyExc_SyntaxError, buf);
525 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526 }
527#endif
528 return line;
529}
530
531static int
532decoding_feof(struct tok_state *tok)
533{
534 if (tok->decoding_state >= 0) {
535 return feof(tok->fp);
536 } else {
537 PyObject* buf = tok->decoding_buffer;
538 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000539 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 if (buf == NULL) {
541 error_ret(tok);
542 return 1;
543 } else {
544 tok->decoding_buffer = buf;
545 }
546 }
547 return PyObject_Length(buf) == 0;
548 }
549}
550
551/* Fetch a byte from TOK, using the string buffer. */
552
Tim Petersc9d78aa2006-03-26 23:27:58 +0000553static int
554buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000555 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556}
557
558/* Unfetch a byte from TOK, using the string buffer. */
559
Tim Petersc9d78aa2006-03-26 23:27:58 +0000560static void
561buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000563 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000564}
565
566/* Set the readline function for TOK to ENC. For the string-based
567 tokenizer, this means to just record the encoding. */
568
Tim Petersc9d78aa2006-03-26 23:27:58 +0000569static int
570buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 tok->enc = enc;
572 return 1;
573}
574
/* Decode the NUL-terminated C string STR using codec ENC and return a
   new string object holding its UTF-8 encoding, or NULL if decoding or
   encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590
Benjamin Petersone36199b2009-11-12 23:39:44 +0000591
592static char *
593translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000594 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000595 char *buf, *current;
Benjamin Peterson42d63842009-12-06 17:37:48 +0000596 char c = '\0';
597 buf = PyMem_MALLOC(needed_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000598 if (buf == NULL) {
599 tok->done = E_NOMEM;
600 return NULL;
601 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000602 for (current = buf; *s; s++, current++) {
603 c = *s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000604 if (skip_next_lf) {
605 skip_next_lf = 0;
606 if (c == '\n') {
Benjamin Peterson42d63842009-12-06 17:37:48 +0000607 c = *++s;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000608 if (!c)
609 break;
610 }
611 }
612 if (c == '\r') {
613 skip_next_lf = 1;
614 c = '\n';
615 }
616 *current = c;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000617 }
Benjamin Peterson42d63842009-12-06 17:37:48 +0000618 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersone36199b2009-11-12 23:39:44 +0000619 there isn't one already. */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000620 if (exec_input && c != '\n') {
Benjamin Petersone36199b2009-11-12 23:39:44 +0000621 *current = '\n';
622 current++;
623 }
624 *current = '\0';
Benjamin Peterson42d63842009-12-06 17:37:48 +0000625 final_length = current - buf + 1;
626 if (final_length < needed_length && final_length)
Benjamin Petersone36199b2009-11-12 23:39:44 +0000627 /* should never fail */
Benjamin Peterson42d63842009-12-06 17:37:48 +0000628 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000629 return buf;
630}
631
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632/* Decode a byte string STR for use as the buffer of TOK.
633 Look for encoding declarations inside STR, and record them
634 inside TOK. */
635
636static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000637decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000638{
639 PyObject* utf8 = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000640 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000642 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643 int lineno = 0;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000644 tok->input = str = translate_newlines(input, single, tok);
645 if (str == NULL)
646 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000647 tok->enc = NULL;
648 tok->str = str;
649 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000650 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000651 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000652 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000653#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000654 if (tok->enc != NULL) {
655 utf8 = translate_into_utf8(str, tok->enc);
656 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000657 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000658 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000660#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 for (s = str;; s++) {
662 if (*s == '\0') break;
663 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000664 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000665 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666 lineno++;
667 if (lineno == 2) break;
668 }
669 }
670 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000671 /* need to check line 1 and 2 separately since check_coding_spec
672 assumes a single line as input */
673 if (newl[0]) {
674 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
675 return error_ret(tok);
676 if (tok->enc == NULL && newl[1]) {
677 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
678 tok, buf_setreadl))
679 return error_ret(tok);
680 }
681 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000682#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683 if (tok->enc != NULL) {
684 assert(utf8 == NULL);
685 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000686 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000687 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000688 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000689 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000690#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 assert(tok->decoding_buffer == NULL);
692 tok->decoding_buffer = utf8; /* CAUTION */
693 return str;
694}
695
696#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000697
698/* Set up tokenizer for string */
699
700struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000701PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000702{
703 struct tok_state *tok = tok_new();
704 if (tok == NULL)
705 return NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000706 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000707 if (str == NULL) {
708 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000709 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000710 }
711
Martin v. Löwis95292d62002-12-11 14:04:59 +0000712 /* XXX: constify members. */
713 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000714 return tok;
715}
716
717
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000718/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000719
720struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000721PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722{
723 struct tok_state *tok = tok_new();
724 if (tok == NULL)
725 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000726 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000727 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000728 return NULL;
729 }
730 tok->cur = tok->inp = tok->buf;
731 tok->end = tok->buf + BUFSIZ;
732 tok->fp = fp;
733 tok->prompt = ps1;
734 tok->nextprompt = ps2;
735 return tok;
736}
737
738
739/* Free a tok_state structure */
740
741void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000742PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000744 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000745 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000746#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000747 Py_XDECREF(tok->decoding_readline);
748 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000749#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000751 PyMem_FREE(tok->buf);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000752 if (tok->input)
753 PyMem_FREE((char *)tok->input);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000754 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755}
756
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the line *INP from the encoding of sys.stdin to UTF-8.

   Nothing is done (and 0 is returned) unless sys.stdin is a file
   object wrapping the C stdin and carrying a string f_encoding.
   On success *INP is replaced by a new PyMem-allocated UTF-8 string
   and tok->encoding is set to the stdin encoding name.  If decoding or
   encoding fails, the error is cleared and *INP is left untouched.
   Returns 0 in all those cases, or -1 with tok->done = E_NOMEM when
   memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode, *utf8_bytes;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold our own reference while enc_name points into the object. */
    Py_INCREF(enc_obj);

    enc_name = PyString_AsString(enc_obj);
    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode == NULL)
        goto fallback;

    utf8_bytes = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (utf8_bytes == NULL)
        goto fallback;

    assert(PyString_Check(utf8_bytes));
    recoded = new_string(PyString_AS_STRING(utf8_bytes),
                         PyString_GET_SIZE(utf8_bytes));
    Py_DECREF(utf8_bytes);
    if (recoded == NULL)
        goto no_memory;

    /* Swap the caller's line for the recoded one. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto no_memory;

    Py_DECREF(enc_obj);
    return 0;

no_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

fallback:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816
817/* Get next char, updating state; error code goes into tok->done */
818
819static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000820tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000821{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000823 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000824 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000825 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000826 if (tok->done != E_OK)
827 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000829 char *end = strchr(tok->inp, '\n');
830 if (end != NULL)
831 end++;
832 else {
833 end = strchr(tok->inp, '\0');
834 if (end == tok->inp) {
835 tok->done = E_EOF;
836 return EOF;
837 }
838 }
839 if (tok->start == NULL)
840 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000841 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000842 tok->lineno++;
843 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000844 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000845 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000847 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000848 if (tok->nextprompt != NULL)
849 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000850 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000851 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000852 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000853 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854 tok->done = E_EOF;
855 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000856#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000857 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000858 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000859#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000861 size_t start = tok->start - tok->buf;
862 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000863 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000864 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000865 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000866 tok->lineno++;
867 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000868 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000869 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000870 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000871 tok->done = E_NOMEM;
872 return EOF;
873 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 tok->buf = buf;
875 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000876 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000877 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000878 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 tok->inp = tok->buf + newlen;
880 tok->end = tok->inp + 1;
881 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000883 else {
884 tok->lineno++;
885 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000886 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000887 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000888 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000890 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000891 tok->inp = strchr(tok->buf, '\0');
892 tok->end = tok->inp + 1;
893 }
894 }
895 else {
896 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000898 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000899 if (tok->start == NULL) {
900 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000901 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000902 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000903 if (tok->buf == NULL) {
904 tok->done = E_NOMEM;
905 return EOF;
906 }
907 tok->end = tok->buf + BUFSIZ;
908 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000909 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
910 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000911 tok->done = E_EOF;
912 done = 1;
913 }
914 else {
915 tok->done = E_OK;
916 tok->inp = strchr(tok->buf, '\0');
917 done = tok->inp[-1] == '\n';
918 }
919 }
920 else {
921 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000922 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000923 tok->done = E_EOF;
924 done = 1;
925 }
926 else
927 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000928 }
929 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000930 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000931 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000932 Py_ssize_t curstart = tok->start == NULL ? -1 :
933 tok->start - tok->buf;
934 Py_ssize_t curvalid = tok->inp - tok->buf;
935 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000936 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000937 newbuf = (char *)PyMem_REALLOC(newbuf,
938 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000939 if (newbuf == NULL) {
940 tok->done = E_NOMEM;
941 tok->cur = tok->inp;
942 return EOF;
943 }
944 tok->buf = newbuf;
945 tok->inp = tok->buf + curvalid;
946 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000947 tok->start = curstart < 0 ? NULL :
948 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000949 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000950 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000951 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000952 /* Break out early on decoding
953 errors, as tok->buf will be NULL
954 */
955 if (tok->decoding_erred)
956 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000957 /* Last line does not end in \n,
958 fake one */
959 strcpy(tok->inp, "\n");
960 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000961 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000962 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000963 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000964 if (tok->buf != NULL) {
965 tok->cur = tok->buf + cur;
966 tok->line_start = tok->cur;
967 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000968 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000969 pt = tok->inp - 2;
970 if (pt >= tok->buf && *pt == '\r') {
971 *pt++ = '\n';
972 *pt = '\0';
973 tok->inp = pt;
974 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000975 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000976 }
977 if (tok->done != E_OK) {
978 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000979 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000980 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000981 return EOF;
982 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000984 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000985}
986
987
988/* Back-up one character */
989
990static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000991tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000992{
993 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000994 if (--tok->cur < tok->buf)
Benjamin Petersone3383b82009-11-07 01:04:38 +0000995 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000996 if (*tok->cur != c)
997 *tok->cur = c;
998 }
999}
1000
1001
1002/* Return the token corresponding to a single character */
1003
1004int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001005PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006{
1007 switch (c) {
1008 case '(': return LPAR;
1009 case ')': return RPAR;
1010 case '[': return LSQB;
1011 case ']': return RSQB;
1012 case ':': return COLON;
1013 case ',': return COMMA;
1014 case ';': return SEMI;
1015 case '+': return PLUS;
1016 case '-': return MINUS;
1017 case '*': return STAR;
1018 case '/': return SLASH;
1019 case '|': return VBAR;
1020 case '&': return AMPER;
1021 case '<': return LESS;
1022 case '>': return GREATER;
1023 case '=': return EQUAL;
1024 case '.': return DOT;
1025 case '%': return PERCENT;
1026 case '`': return BACKQUOTE;
1027 case '{': return LBRACE;
1028 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001029 case '^': return CIRCUMFLEX;
1030 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001031 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032 default: return OP;
1033 }
1034}
1035
1036
Guido van Rossumfbab9051991-10-20 20:25:03 +00001037int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001038PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001039{
1040 switch (c1) {
1041 case '=':
1042 switch (c2) {
1043 case '=': return EQEQUAL;
1044 }
1045 break;
1046 case '!':
1047 switch (c2) {
1048 case '=': return NOTEQUAL;
1049 }
1050 break;
1051 case '<':
1052 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001053 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001054 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001055 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001056 }
1057 break;
1058 case '>':
1059 switch (c2) {
1060 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001061 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001062 }
1063 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001064 case '+':
1065 switch (c2) {
1066 case '=': return PLUSEQUAL;
1067 }
1068 break;
1069 case '-':
1070 switch (c2) {
1071 case '=': return MINEQUAL;
1072 }
1073 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001074 case '*':
1075 switch (c2) {
1076 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 case '=': return STAREQUAL;
1078 }
1079 break;
1080 case '/':
1081 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001082 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 case '=': return SLASHEQUAL;
1084 }
1085 break;
1086 case '|':
1087 switch (c2) {
1088 case '=': return VBAREQUAL;
1089 }
1090 break;
1091 case '%':
1092 switch (c2) {
1093 case '=': return PERCENTEQUAL;
1094 }
1095 break;
1096 case '&':
1097 switch (c2) {
1098 case '=': return AMPEREQUAL;
1099 }
1100 break;
1101 case '^':
1102 switch (c2) {
1103 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001104 }
1105 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001106 }
1107 return OP;
1108}
1109
Thomas Wouters434d0822000-08-24 20:11:32 +00001110int
1111PyToken_ThreeChars(int c1, int c2, int c3)
1112{
1113 switch (c1) {
1114 case '<':
1115 switch (c2) {
1116 case '<':
1117 switch (c3) {
1118 case '=':
1119 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001120 }
1121 break;
1122 }
1123 break;
1124 case '>':
1125 switch (c2) {
1126 case '>':
1127 switch (c3) {
1128 case '=':
1129 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001130 }
1131 break;
1132 }
1133 break;
1134 case '*':
1135 switch (c2) {
1136 case '*':
1137 switch (c3) {
1138 case '=':
1139 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001140 }
1141 break;
1142 }
1143 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001144 case '/':
1145 switch (c2) {
1146 case '/':
1147 switch (c3) {
1148 case '=':
1149 return DOUBLESLASHEQUAL;
1150 }
1151 break;
1152 }
1153 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001154 }
1155 return OP;
1156}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001157
Guido van Rossum926f13a1998-04-09 21:38:06 +00001158static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001159indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160{
1161 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001162 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001163 tok->cur = tok->inp;
1164 return 1;
1165 }
1166 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001167 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1168 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001169 tok->altwarning = 0;
1170 }
1171 return 0;
1172}
1173
1174
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001175/* Get next token, after space stripping etc. */
1176
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001177static int
1178tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179{
1180 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001181 int blankline;
1182
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001183 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001184 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001185 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001186 blankline = 0;
1187
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001188 /* Get indentation level */
1189 if (tok->atbol) {
1190 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001191 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193 for (;;) {
1194 c = tok_nextc(tok);
1195 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001196 col++, altcol++;
1197 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001199 altcol = (altcol/tok->alttabsize + 1)
1200 * tok->alttabsize;
1201 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001202 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001203 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 else
1205 break;
1206 }
1207 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001208 if (c == '#' || c == '\n') {
1209 /* Lines with only whitespace and/or comments
1210 shouldn't affect the indentation and are
1211 not passed to the parser as NEWLINE tokens,
1212 except *totally* empty lines in interactive
1213 mode, which signal the end of a command group. */
1214 if (col == 0 && c == '\n' && tok->prompt != NULL)
1215 blankline = 0; /* Let it through */
1216 else
1217 blankline = 1; /* Ignore completely */
1218 /* We can't jump back right here since we still
1219 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001221 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001222 if (col == tok->indstack[tok->indent]) {
1223 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001224 if (altcol != tok->altindstack[tok->indent]) {
1225 if (indenterror(tok))
1226 return ERRORTOKEN;
1227 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001228 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001229 else if (col > tok->indstack[tok->indent]) {
1230 /* Indent -- always one */
1231 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001232 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001233 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001234 return ERRORTOKEN;
1235 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001236 if (altcol <= tok->altindstack[tok->indent]) {
1237 if (indenterror(tok))
1238 return ERRORTOKEN;
1239 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001240 tok->pendin++;
1241 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001242 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001243 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001244 else /* col < tok->indstack[tok->indent] */ {
1245 /* Dedent -- any number, must be consistent */
1246 while (tok->indent > 0 &&
1247 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001248 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001249 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001250 }
1251 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001252 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001253 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001254 return ERRORTOKEN;
1255 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001256 if (altcol != tok->altindstack[tok->indent]) {
1257 if (indenterror(tok))
1258 return ERRORTOKEN;
1259 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001260 }
1261 }
1262 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001263
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001264 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001265
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 /* Return pending indents/dedents */
1267 if (tok->pendin != 0) {
1268 if (tok->pendin < 0) {
1269 tok->pendin++;
1270 return DEDENT;
1271 }
1272 else {
1273 tok->pendin--;
1274 return INDENT;
1275 }
1276 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001277
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001279 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 /* Skip spaces */
1281 do {
1282 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001283 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001284
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001286 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001287
Guido van Rossumab5ca152000-03-31 00:52:27 +00001288 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001290 static char *tabforms[] = {
1291 "tab-width:", /* Emacs */
1292 ":tabstop=", /* vim, full form */
1293 ":ts=", /* vim, abbreviated form */
1294 "set tabsize=", /* will vi never die? */
1295 /* more templates can be added here to support other editors */
1296 };
1297 char cbuf[80];
1298 char *tp, **cp;
1299 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001301 *tp++ = c = tok_nextc(tok);
1302 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001303 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001304 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001305 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001306 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1307 cp++) {
1308 if ((tp = strstr(cbuf, *cp))) {
1309 int newsize = atoi(tp + strlen(*cp));
1310
1311 if (newsize >= 1 && newsize <= 40) {
1312 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001313 if (Py_VerboseFlag)
1314 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001315 "Tab size set to %d\n",
1316 newsize);
1317 }
1318 }
1319 }
1320 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001323
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001325 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001327 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001328
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 /* Identifier (most frequent token!) */
1330 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001331 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001332 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001333 case 'b':
1334 case 'B':
1335 c = tok_nextc(tok);
1336 if (c == 'r' || c == 'R')
1337 c = tok_nextc(tok);
1338 if (c == '"' || c == '\'')
1339 goto letter_quote;
1340 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001341 case 'r':
1342 case 'R':
1343 c = tok_nextc(tok);
1344 if (c == '"' || c == '\'')
1345 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001346 break;
1347 case 'u':
1348 case 'U':
1349 c = tok_nextc(tok);
1350 if (c == 'r' || c == 'R')
1351 c = tok_nextc(tok);
1352 if (c == '"' || c == '\'')
1353 goto letter_quote;
1354 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001355 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001356 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001358 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001359 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001360 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001361 *p_end = tok->cur;
1362 return NAME;
1363 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001364
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 /* Newline */
1366 if (c == '\n') {
1367 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001368 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001369 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001370 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001372 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001373 return NEWLINE;
1374 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001375
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001376 /* Period or number starting with period? */
1377 if (c == '.') {
1378 c = tok_nextc(tok);
1379 if (isdigit(c)) {
1380 goto fraction;
1381 }
1382 else {
1383 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001385 *p_end = tok->cur;
1386 return DOT;
1387 }
1388 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001389
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001390 /* Number */
1391 if (isdigit(c)) {
1392 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001393 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001394 c = tok_nextc(tok);
1395 if (c == '.')
1396 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001397#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001398 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001399 goto imaginary;
1400#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001402
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001404 c = tok_nextc(tok);
1405 if (!isxdigit(c)) {
1406 tok->done = E_TOKEN;
1407 tok_backup(tok, c);
1408 return ERRORTOKEN;
1409 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001410 do {
1411 c = tok_nextc(tok);
1412 } while (isxdigit(c));
1413 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001414 else if (c == 'o' || c == 'O') {
1415 /* Octal */
1416 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001417 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001418 tok->done = E_TOKEN;
1419 tok_backup(tok, c);
1420 return ERRORTOKEN;
1421 }
1422 do {
1423 c = tok_nextc(tok);
1424 } while ('0' <= c && c < '8');
1425 }
1426 else if (c == 'b' || c == 'B') {
1427 /* Binary */
1428 c = tok_nextc(tok);
1429 if (c != '0' && c != '1') {
1430 tok->done = E_TOKEN;
1431 tok_backup(tok, c);
1432 return ERRORTOKEN;
1433 }
1434 do {
1435 c = tok_nextc(tok);
1436 } while (c == '0' || c == '1');
1437 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001439 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001440 /* Octal; c is first char of it */
1441 /* There's no 'isoctdigit' macro, sigh */
1442 while ('0' <= c && c < '8') {
1443 c = tok_nextc(tok);
1444 }
Tim Petersd507dab2001-08-30 20:51:59 +00001445 if (isdigit(c)) {
1446 found_decimal = 1;
1447 do {
1448 c = tok_nextc(tok);
1449 } while (isdigit(c));
1450 }
1451 if (c == '.')
1452 goto fraction;
1453 else if (c == 'e' || c == 'E')
1454 goto exponent;
1455#ifndef WITHOUT_COMPLEX
1456 else if (c == 'j' || c == 'J')
1457 goto imaginary;
1458#endif
1459 else if (found_decimal) {
1460 tok->done = E_TOKEN;
1461 tok_backup(tok, c);
1462 return ERRORTOKEN;
1463 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001464 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001465 if (c == 'l' || c == 'L')
1466 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467 }
1468 else {
1469 /* Decimal */
1470 do {
1471 c = tok_nextc(tok);
1472 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001473 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001475 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001476 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001477 if (c == '.') {
1478 fraction:
1479 /* Fraction */
1480 do {
1481 c = tok_nextc(tok);
1482 } while (isdigit(c));
1483 }
1484 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001485 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001486 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001487 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001488 if (c == '+' || c == '-')
1489 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001490 if (!isdigit(c)) {
1491 tok->done = E_TOKEN;
1492 tok_backup(tok, c);
1493 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001494 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001495 do {
1496 c = tok_nextc(tok);
1497 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001498 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001499#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001500 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001501 /* Imaginary part */
1502 imaginary:
1503 c = tok_nextc(tok);
1504#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001505 }
1506 }
1507 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001508 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001509 *p_end = tok->cur;
1510 return NUMBER;
1511 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001512
1513 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001514 /* String */
1515 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001517 int quote = c;
1518 int triple = 0;
1519 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001520 for (;;) {
1521 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001522 if (c == '\n') {
1523 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001524 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001525 tok_backup(tok, c);
1526 return ERRORTOKEN;
1527 }
1528 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001529 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001530 }
1531 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001532 if (triple)
1533 tok->done = E_EOFS;
1534 else
1535 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001536 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001537 return ERRORTOKEN;
1538 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001539 else if (c == quote) {
1540 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001541 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001542 c = tok_nextc(tok);
1543 if (c == quote) {
1544 triple = 1;
1545 tripcount = 0;
1546 continue;
1547 }
1548 tok_backup(tok, c);
1549 }
1550 if (!triple || tripcount == 3)
1551 break;
1552 }
1553 else if (c == '\\') {
1554 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001555 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001556 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001557 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001558 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001559 return ERRORTOKEN;
1560 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001561 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001562 else
1563 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001564 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001565 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001566 *p_end = tok->cur;
1567 return STRING;
1568 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001569
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001570 /* Line continuation */
1571 if (c == '\\') {
1572 c = tok_nextc(tok);
1573 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001574 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001575 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001576 return ERRORTOKEN;
1577 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001578 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001579 goto again; /* Read next line */
1580 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001581
Guido van Rossumfbab9051991-10-20 20:25:03 +00001582 /* Check for two-character token */
1583 {
1584 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001585 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001586#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001587 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001588 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001589 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001590 tok->filename, tok->lineno,
1591 NULL, NULL)) {
1592 return ERRORTOKEN;
1593 }
1594 }
1595#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001596 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001597 int c3 = tok_nextc(tok);
1598 int token3 = PyToken_ThreeChars(c, c2, c3);
1599 if (token3 != OP) {
1600 token = token3;
1601 } else {
1602 tok_backup(tok, c3);
1603 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001604 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001605 *p_end = tok->cur;
1606 return token;
1607 }
1608 tok_backup(tok, c2);
1609 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001610
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001611 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001612 switch (c) {
1613 case '(':
1614 case '[':
1615 case '{':
1616 tok->level++;
1617 break;
1618 case ')':
1619 case ']':
1620 case '}':
1621 tok->level--;
1622 break;
1623 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001624
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001625 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001626 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001627 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001628 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001629}
1630
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001631int
1632PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1633{
1634 int result = tok_get(tok, p_start, p_end);
1635 if (tok->decoding_erred) {
1636 result = ERRORTOKEN;
1637 tok->done = E_DECODE;
1638 }
1639 return result;
1640}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001641
Martin v. Löwisa5136192007-09-04 14:19:28 +00001642/* This function is only called from parsetok. However, it cannot live
1643 there, as it must be empty for PGEN, and we can check for PGEN only
1644 in this file. */
1645
Christian Heimes082c9b02008-01-23 14:20:50 +00001646#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub variant, compiled when PGEN is defined or Py_USING_UNICODE is
   not: no re-encoding is performed.  The arguments are ignored,
   *offset is left untouched and NULL is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1652#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001653#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001654static PyObject *
1655dec_utf8(const char *enc, const char *text, size_t len) {
1656 PyObject *ret = NULL;
1657 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1658 if (unicode_text) {
1659 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1660 Py_DECREF(unicode_text);
1661 }
1662 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001663 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001664 }
1665 return ret;
1666}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001667char *
1668PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1669{
1670 char *text = NULL;
1671 if (tok->encoding) {
1672 /* convert source to original encondig */
1673 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1674 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001675 int linelen = PyString_Size(lineobj);
1676 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001677 text = PyObject_MALLOC(linelen + 1);
1678 if (text != NULL && line != NULL) {
1679 if (linelen)
1680 strncpy(text, line, linelen);
1681 text[linelen] = '\0';
1682 }
1683 Py_DECREF(lineobj);
1684
1685 /* adjust error offset */
1686 if (*offset > 1) {
1687 PyObject *offsetobj = dec_utf8(tok->encoding,
1688 tok->buf, *offset-1);
1689 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001690 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001691 Py_DECREF(offsetobj);
1692 }
1693 }
1694
1695 }
1696 }
1697 return text;
1698
1699}
Georg Brandl76b30d12008-01-07 18:41:34 +00001700#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001701#endif
1702
Martin v. Löwisa5136192007-09-04 14:19:28 +00001703
Guido van Rossum408027e1996-12-30 16:17:54 +00001704#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001705
1706void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001707tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001708{
Guido van Rossum86bea461997-04-29 21:03:06 +00001709 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001710 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1711 printf("(%.*s)", (int)(end - start), start);
1712}
1713
1714#endif