blob: 1808c41fa2561fa3650247ea6f2e37283fd9b76b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Gregory P. Smithdd96db62008-06-09 04:58:54 +000015#include "stringobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
Christian Heimes729ab152007-11-23 09:10:36 +000019#include "pydebug.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000020#endif /* PGEN */
21
Martin v. Löwis566f6af2002-10-26 14:39:10 +000022extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000023/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
Guido van Rossum4fe87291992-02-26 15:24:44 +000027/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000029
Guido van Rossum3f5da241990-12-20 15:06:42 +000030/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000031static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000034
/* Token names.
   The position of every entry in this table must match the #defines in
   token.h!  The number in front of each row is the array index of the
   first name on that row. */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    /* 26 */ "LBRACE", "RBRACE",
    /* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 36 */ "DOUBLESTAR",
    /* 37 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 41 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 45 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 48 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 50 */ "AT",
    /* 51 */ "OP",
    /* 52 */ "<ERRORTOKEN>",
    /* 53 */ "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +000099tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000100{
Anthony Baxter11490022006-04-11 05:39:14 +0000101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000103 if (tok == NULL)
104 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106 tok->done = E_OK;
107 tok->fp = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000108 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 tok->tabsize = TABSIZE;
110 tok->indent = 0;
111 tok->indstack[0] = 0;
112 tok->atbol = 1;
113 tok->pendin = 0;
114 tok->prompt = tok->nextprompt = NULL;
115 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000116 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000117 tok->filename = NULL;
118 tok->altwarning = 0;
119 tok->alterror = 0;
120 tok->alttabsize = 1;
121 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000122 tok->decoding_state = 0;
123 tok->decoding_erred = 0;
124 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000125 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000126 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000127#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000128 tok->decoding_readline = NULL;
129 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000130#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000131 return tok;
132}
133
Benjamin Petersone36199b2009-11-12 23:39:44 +0000134static char *
135new_string(const char *s, Py_ssize_t len)
136{
137 char* result = (char *)PyMem_MALLOC(len + 1);
138 if (result != NULL) {
139 memcpy(result, s, len);
140 result[len] = '\0';
141 }
142 return result;
143}
144
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145#ifdef PGEN
146
147static char *
148decoding_fgets(char *s, int size, struct tok_state *tok)
149{
150 return fgets(s, size, tok->fp);
151}
152
153static int
154decoding_feof(struct tok_state *tok)
155{
156 return feof(tok->fp);
157}
158
/* PGEN build: no decoding is done; return a freshly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes);
}
164
165#else /* PGEN */
166
167static char *
168error_ret(struct tok_state *tok) /* XXX */
169{
170 tok->decoding_erred = 1;
171 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Neal Norwitz08062d62006-04-11 08:19:15 +0000172 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173 tok->buf = NULL;
174 return NULL; /* as if it were EOF */
175}
176
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177
/* Normalize the spelling of an encoding name (for utf-8 and latin-1).

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S is one of the recognized spellings of those encodings (optionally
   followed by a "-suffix"); otherwise returns S itself, so callers can
   detect "no change" by comparing pointers. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a plain char >= 0x80 where char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Neal Norwitz08062d62006-04-11 08:19:15 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Neal Norwitz08062d62006-04-11 08:19:15 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Neal Norwitz08062d62006-04-11 08:19:15 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000320 ch = get_char(tok);
321 if (ch != 0xBB)
322 goto NON_BOM;
323 ch = get_char(tok);
324 if (ch != 0xBF)
325 goto NON_BOM;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000326#if 0
327 /* Disable support for UTF-16 BOMs until a decision
328 is made whether this needs to be supported. */
329 } else if (ch == 0xFE) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000330 ch = get_char(tok);
331 if (ch != 0xFF)
332 goto NON_BOM;
333 if (!set_readline(tok, "utf-16-be"))
334 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335 tok->decoding_state = -1;
336 } else if (ch == 0xFF) {
Benjamin Peterson9586cf82009-10-09 21:48:14 +0000337 ch = get_char(tok);
338 if (ch != 0xFE)
339 goto NON_BOM;
340 if (!set_readline(tok, "utf-16-le"))
341 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000342 tok->decoding_state = -1;
343#endif
344 } else {
345 unget_char(ch, tok);
346 return 1;
347 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000348 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000349 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000350 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
351 return 1;
352 NON_BOM:
353 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
354 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
355 return 1;
356}
357
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a NUL.
   Return NULL on failure or at end of input (an empty line), else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
        and stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough
        room (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the stored object; an exact string is the UTF-8
           overflow of an earlier call and needs no conversion. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    return (nbytes == 0) ? NULL /* EOF */ : s;
#endif
}
423
424/* Set the readline function for TOK to a StreamReader's
425 readline function. The StreamReader is named ENC.
426
427 This function is called from check_bom and check_coding_spec.
428
429 ENC is usually identical to the future value of tok->encoding,
430 except for the (currently unsupported) case of UTF-16.
431
432 Return 1 on success, 0 on failure. */
433
434static int
435fp_setreadl(struct tok_state *tok, const char* enc)
436{
437 PyObject *reader, *stream, *readline;
438
Martin v. Löwis95292d62002-12-11 14:04:59 +0000439 /* XXX: constify filename argument. */
440 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000441 if (stream == NULL)
442 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443
444 reader = PyCodec_StreamReader(enc, stream, NULL);
445 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000446 if (reader == NULL)
447 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000448
449 readline = PyObject_GetAttrString(reader, "readline");
450 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000451 if (readline == NULL)
452 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453
454 tok->decoding_readline = readline;
455 return 1;
456}
457
458/* Fetch the next byte from TOK. */
459
460static int fp_getc(struct tok_state *tok) {
461 return getc(tok->fp);
462}
463
464/* Unfetch the last byte back into TOK. */
465
466static void fp_ungetc(int c, struct tok_state *tok) {
467 ungetc(c, tok->fp);
468}
469
470/* Read a line of input from TOK. Determine encoding
471 if necessary. */
472
473static char *
474decoding_fgets(char *s, int size, struct tok_state *tok)
475{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000476 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000477 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000478 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 if (tok->decoding_state < 0) {
480 /* We already have a codec associated with
481 this input. */
482 line = fp_readl(s, size, tok);
483 break;
484 } else if (tok->decoding_state > 0) {
485 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000486 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 break;
489 } else {
490 /* We have not yet determined the encoding.
491 If an encoding is found, use the file-pointer
492 reader functions from now on. */
493 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
494 return error_ret(tok);
495 assert(tok->decoding_state != 0);
496 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000497 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
499 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
500 return error_ret(tok);
501 }
502 }
503#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000504 /* The default encoding is ASCII, so make sure we don't have any
505 non-ASCII bytes in it. */
506 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000508 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000509 if (*c > 127) {
510 badchar = *c;
511 break;
512 }
513 }
514 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000515 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000516 /* Need to add 1 to the line number, since this line
517 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000518 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000519 "Non-ASCII character '\\x%.2x' "
520 "in file %.200s on line %i, "
521 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000522 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000523 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000524 PyErr_SetString(PyExc_SyntaxError, buf);
525 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526 }
527#endif
528 return line;
529}
530
531static int
532decoding_feof(struct tok_state *tok)
533{
534 if (tok->decoding_state >= 0) {
535 return feof(tok->fp);
536 } else {
537 PyObject* buf = tok->decoding_buffer;
538 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000539 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 if (buf == NULL) {
541 error_ret(tok);
542 return 1;
543 } else {
544 tok->decoding_buffer = buf;
545 }
546 }
547 return PyObject_Length(buf) == 0;
548 }
549}
550
551/* Fetch a byte from TOK, using the string buffer. */
552
Tim Petersc9d78aa2006-03-26 23:27:58 +0000553static int
554buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000555 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556}
557
558/* Unfetch a byte from TOK, using the string buffer. */
559
Tim Petersc9d78aa2006-03-26 23:27:58 +0000560static void
561buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000563 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000564}
565
566/* Set the readline function for TOK to ENC. For the string-based
567 tokenizer, this means to just record the encoding. */
568
Tim Petersc9d78aa2006-03-26 23:27:58 +0000569static int
570buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 tok->enc = enc;
572 return 1;
573}
574
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns NULL if either the decoding or the re-encoding step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000590
Benjamin Petersone36199b2009-11-12 23:39:44 +0000591
592static char *
593translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
594 int skip_next_lf = 0, length = strlen(s), final_length;
595 char *buf, *current;
596 char c;
597 buf = PyMem_MALLOC(length + 2);
598 if (buf == NULL) {
599 tok->done = E_NOMEM;
600 return NULL;
601 }
602 for (current = buf; (c = *s++);) {
603 if (skip_next_lf) {
604 skip_next_lf = 0;
605 if (c == '\n') {
606 c = *s;
607 s++;
608 if (!c)
609 break;
610 }
611 }
612 if (c == '\r') {
613 skip_next_lf = 1;
614 c = '\n';
615 }
616 *current = c;
617 current++;
618 }
619 /* If this is exec input, add a newline to the end of the file if
620 there isn't one already. */
621 if (exec_input && *current != '\n') {
622 *current = '\n';
623 current++;
624 }
625 *current = '\0';
626 final_length = current - buf;
627 if (final_length < length && final_length)
628 /* should never fail */
629 buf = PyMem_REALLOC(buf, final_length + 1);
630 return buf;
631}
632
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000633/* Decode a byte string STR for use as the buffer of TOK.
634 Look for encoding declarations inside STR, and record them
635 inside TOK. */
636
637static const char *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000638decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639{
640 PyObject* utf8 = NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000641 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 const char *s;
Georg Brandl898f1872008-01-21 21:14:21 +0000643 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 int lineno = 0;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000645 tok->input = str = translate_newlines(input, single, tok);
646 if (str == NULL)
647 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 tok->enc = NULL;
649 tok->str = str;
650 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000651 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000653 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000654#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655 if (tok->enc != NULL) {
656 utf8 = translate_into_utf8(str, tok->enc);
657 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000658 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000659 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000661#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662 for (s = str;; s++) {
663 if (*s == '\0') break;
664 else if (*s == '\n') {
Neal Norwitzc44af332008-01-27 17:10:29 +0000665 assert(lineno < 2);
Georg Brandl38d17152008-01-21 18:35:49 +0000666 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667 lineno++;
668 if (lineno == 2) break;
669 }
670 }
671 tok->enc = NULL;
Georg Brandl38d17152008-01-21 18:35:49 +0000672 /* need to check line 1 and 2 separately since check_coding_spec
673 assumes a single line as input */
674 if (newl[0]) {
675 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
676 return error_ret(tok);
677 if (tok->enc == NULL && newl[1]) {
678 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
679 tok, buf_setreadl))
680 return error_ret(tok);
681 }
682 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000683#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000684 if (tok->enc != NULL) {
685 assert(utf8 == NULL);
686 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson08a0bbc2009-06-16 00:29:31 +0000687 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000688 return error_ret(tok);
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000689 str = PyString_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000690 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000691#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000692 assert(tok->decoding_buffer == NULL);
693 tok->decoding_buffer = utf8; /* CAUTION */
694 return str;
695}
696
697#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000698
699/* Set up tokenizer for string */
700
701struct tok_state *
Benjamin Petersone36199b2009-11-12 23:39:44 +0000702PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703{
704 struct tok_state *tok = tok_new();
705 if (tok == NULL)
706 return NULL;
Benjamin Petersone36199b2009-11-12 23:39:44 +0000707 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000708 if (str == NULL) {
709 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000710 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000711 }
712
Martin v. Löwis95292d62002-12-11 14:04:59 +0000713 /* XXX: constify members. */
714 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715 return tok;
716}
717
718
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000719/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720
721struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000722PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000723{
724 struct tok_state *tok = tok_new();
725 if (tok == NULL)
726 return NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000727 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000728 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729 return NULL;
730 }
731 tok->cur = tok->inp = tok->buf;
732 tok->end = tok->buf + BUFSIZ;
733 tok->fp = fp;
734 tok->prompt = ps1;
735 tok->nextprompt = ps2;
736 return tok;
737}
738
739
740/* Free a tok_state structure */
741
742void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000743PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000745 if (tok->encoding != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000746 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000747#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748 Py_XDECREF(tok->decoding_readline);
749 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000750#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751 if (tok->fp != NULL && tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000752 PyMem_FREE(tok->buf);
Benjamin Petersone36199b2009-11-12 23:39:44 +0000753 if (tok->input)
754 PyMem_FREE((char *)tok->input);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000755 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756}
757
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP, read from stdin, as UTF-8.

   This only happens when sys.stdin is a file object wrapping the C
   stdin and carries a string f_encoding.  On success *INP is replaced
   by a newly allocated UTF-8 string (the old one is freed) and
   tok->encoding is set to a copy of the encoding name.

   Returns 0 both when the line was converted and when it was left
   alone (including when decoding fails, in which case the error is
   cleared).  Returns -1 with tok->done set to E_NOMEM when memory runs
   out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *stdin_obj, *enc_obj, *unicode, *encoded;
    const char *enc_name;
    char *replacement;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold a reference of our own while the name is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    unicode = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode == NULL)
        goto decode_failed;

    encoded = PyUnicode_AsEncodedString(unicode, "utf-8", NULL);
    Py_DECREF(unicode);
    if (encoded == NULL)
        goto decode_failed;

    assert(PyString_Check(encoded));
    replacement = new_string(PyString_AS_STRING(encoded),
                             PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (replacement == NULL)
        goto out_of_memory;

    PyMem_FREE(*inp);
    *inp = replacement;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

decode_failed:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817
818/* Get next char, updating state; error code goes into tok->done */
819
820static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000821tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000823 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000824 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000825 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000826 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000827 if (tok->done != E_OK)
828 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000829 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000830 char *end = strchr(tok->inp, '\n');
831 if (end != NULL)
832 end++;
833 else {
834 end = strchr(tok->inp, '\0');
835 if (end == tok->inp) {
836 tok->done = E_EOF;
837 return EOF;
838 }
839 }
840 if (tok->start == NULL)
841 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000842 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000843 tok->lineno++;
844 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000845 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847 if (tok->prompt != NULL) {
Anthony Baxter11490022006-04-11 05:39:14 +0000848 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849 if (tok->nextprompt != NULL)
850 tok->prompt = tok->nextprompt;
Anthony Baxter11490022006-04-11 05:39:14 +0000851 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000852 tok->done = E_INTR;
Anthony Baxter11490022006-04-11 05:39:14 +0000853 else if (*newtok == '\0') {
Neal Norwitz08062d62006-04-11 08:19:15 +0000854 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000855 tok->done = E_EOF;
856 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000857#if !defined(PGEN) && defined(Py_USING_UNICODE)
Anthony Baxter11490022006-04-11 05:39:14 +0000858 else if (tok_stdin_decode(tok, &newtok) != 0)
Neal Norwitz08062d62006-04-11 08:19:15 +0000859 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000860#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000861 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000862 size_t start = tok->start - tok->buf;
863 size_t oldlen = tok->cur - tok->buf;
Anthony Baxter11490022006-04-11 05:39:14 +0000864 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000865 char *buf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000866 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000867 tok->lineno++;
868 if (buf == NULL) {
Neal Norwitz08062d62006-04-11 08:19:15 +0000869 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000870 tok->buf = NULL;
Neal Norwitz08062d62006-04-11 08:19:15 +0000871 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000872 tok->done = E_NOMEM;
873 return EOF;
874 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000875 tok->buf = buf;
876 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000877 tok->line_start = tok->cur;
Anthony Baxter11490022006-04-11 05:39:14 +0000878 strcpy(tok->buf + oldlen, newtok);
Neal Norwitz08062d62006-04-11 08:19:15 +0000879 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->inp = tok->buf + newlen;
881 tok->end = tok->inp + 1;
882 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000883 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000884 else {
885 tok->lineno++;
886 if (tok->buf != NULL)
Neal Norwitz08062d62006-04-11 08:19:15 +0000887 PyMem_FREE(tok->buf);
Anthony Baxter11490022006-04-11 05:39:14 +0000888 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000889 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000891 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 tok->inp = strchr(tok->buf, '\0');
893 tok->end = tok->inp + 1;
894 }
895 }
896 else {
897 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000898 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000899 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000900 if (tok->start == NULL) {
901 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000902 tok->buf = (char *)
Neal Norwitz08062d62006-04-11 08:19:15 +0000903 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 if (tok->buf == NULL) {
905 tok->done = E_NOMEM;
906 return EOF;
907 }
908 tok->end = tok->buf + BUFSIZ;
909 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000910 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
911 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000912 tok->done = E_EOF;
913 done = 1;
914 }
915 else {
916 tok->done = E_OK;
917 tok->inp = strchr(tok->buf, '\0');
918 done = tok->inp[-1] == '\n';
919 }
920 }
921 else {
922 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000923 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000924 tok->done = E_EOF;
925 done = 1;
926 }
927 else
928 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000929 }
930 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000931 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000932 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000933 Py_ssize_t curstart = tok->start == NULL ? -1 :
934 tok->start - tok->buf;
935 Py_ssize_t curvalid = tok->inp - tok->buf;
936 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000937 char *newbuf = tok->buf;
Neal Norwitz08062d62006-04-11 08:19:15 +0000938 newbuf = (char *)PyMem_REALLOC(newbuf,
939 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000940 if (newbuf == NULL) {
941 tok->done = E_NOMEM;
942 tok->cur = tok->inp;
943 return EOF;
944 }
945 tok->buf = newbuf;
946 tok->inp = tok->buf + curvalid;
947 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000948 tok->start = curstart < 0 ? NULL :
949 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000950 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000951 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000952 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000953 /* Break out early on decoding
954 errors, as tok->buf will be NULL
955 */
956 if (tok->decoding_erred)
957 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000958 /* Last line does not end in \n,
959 fake one */
960 strcpy(tok->inp, "\n");
961 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000962 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000963 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000964 }
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000965 if (tok->buf != NULL) {
966 tok->cur = tok->buf + cur;
967 tok->line_start = tok->cur;
968 /* replace "\r\n" with "\n" */
Andrew M. Kuchling9b3a8242006-10-06 18:51:55 +0000969 /* For Mac leave the \r, giving a syntax error */
Neal Norwitzd21a7ff2006-06-02 06:23:00 +0000970 pt = tok->inp - 2;
971 if (pt >= tok->buf && *pt == '\r') {
972 *pt++ = '\n';
973 *pt = '\0';
974 tok->inp = pt;
975 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000976 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000977 }
978 if (tok->done != E_OK) {
979 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000980 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000981 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000982 return EOF;
983 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000984 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000985 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000986}
987
988
989/* Back-up one character */
990
991static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000992tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000993{
994 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000995 if (--tok->cur < tok->buf)
Benjamin Petersone3383b82009-11-07 01:04:38 +0000996 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000997 if (*tok->cur != c)
998 *tok->cur = c;
999 }
1000}
1001
1002
1003/* Return the token corresponding to a single character */
1004
1005int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001006PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001007{
1008 switch (c) {
1009 case '(': return LPAR;
1010 case ')': return RPAR;
1011 case '[': return LSQB;
1012 case ']': return RSQB;
1013 case ':': return COLON;
1014 case ',': return COMMA;
1015 case ';': return SEMI;
1016 case '+': return PLUS;
1017 case '-': return MINUS;
1018 case '*': return STAR;
1019 case '/': return SLASH;
1020 case '|': return VBAR;
1021 case '&': return AMPER;
1022 case '<': return LESS;
1023 case '>': return GREATER;
1024 case '=': return EQUAL;
1025 case '.': return DOT;
1026 case '%': return PERCENT;
1027 case '`': return BACKQUOTE;
1028 case '{': return LBRACE;
1029 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001030 case '^': return CIRCUMFLEX;
1031 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001032 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001033 default: return OP;
1034 }
1035}
1036
1037
Guido van Rossumfbab9051991-10-20 20:25:03 +00001038int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001039PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001040{
1041 switch (c1) {
1042 case '=':
1043 switch (c2) {
1044 case '=': return EQEQUAL;
1045 }
1046 break;
1047 case '!':
1048 switch (c2) {
1049 case '=': return NOTEQUAL;
1050 }
1051 break;
1052 case '<':
1053 switch (c2) {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001054 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001055 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001056 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001057 }
1058 break;
1059 case '>':
1060 switch (c2) {
1061 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001062 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001063 }
1064 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001065 case '+':
1066 switch (c2) {
1067 case '=': return PLUSEQUAL;
1068 }
1069 break;
1070 case '-':
1071 switch (c2) {
1072 case '=': return MINEQUAL;
1073 }
1074 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001075 case '*':
1076 switch (c2) {
1077 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001078 case '=': return STAREQUAL;
1079 }
1080 break;
1081 case '/':
1082 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001083 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001084 case '=': return SLASHEQUAL;
1085 }
1086 break;
1087 case '|':
1088 switch (c2) {
1089 case '=': return VBAREQUAL;
1090 }
1091 break;
1092 case '%':
1093 switch (c2) {
1094 case '=': return PERCENTEQUAL;
1095 }
1096 break;
1097 case '&':
1098 switch (c2) {
1099 case '=': return AMPEREQUAL;
1100 }
1101 break;
1102 case '^':
1103 switch (c2) {
1104 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001105 }
1106 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001107 }
1108 return OP;
1109}
1110
Thomas Wouters434d0822000-08-24 20:11:32 +00001111int
1112PyToken_ThreeChars(int c1, int c2, int c3)
1113{
1114 switch (c1) {
1115 case '<':
1116 switch (c2) {
1117 case '<':
1118 switch (c3) {
1119 case '=':
1120 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001121 }
1122 break;
1123 }
1124 break;
1125 case '>':
1126 switch (c2) {
1127 case '>':
1128 switch (c3) {
1129 case '=':
1130 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001131 }
1132 break;
1133 }
1134 break;
1135 case '*':
1136 switch (c2) {
1137 case '*':
1138 switch (c3) {
1139 case '=':
1140 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001141 }
1142 break;
1143 }
1144 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001145 case '/':
1146 switch (c2) {
1147 case '/':
1148 switch (c3) {
1149 case '=':
1150 return DOUBLESLASHEQUAL;
1151 }
1152 break;
1153 }
1154 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001155 }
1156 return OP;
1157}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001158
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001160indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001161{
1162 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001163 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001164 tok->cur = tok->inp;
1165 return 1;
1166 }
1167 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001168 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1169 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001170 tok->altwarning = 0;
1171 }
1172 return 0;
1173}
1174
1175
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176/* Get next token, after space stripping etc. */
1177
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001178static int
1179tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180{
1181 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001182 int blankline;
1183
Andrew M. Kuchling110a48c2008-08-05 02:05:23 +00001184 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001185 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001186 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001187 blankline = 0;
1188
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 /* Get indentation level */
1190 if (tok->atbol) {
1191 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001192 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 for (;;) {
1195 c = tok_nextc(tok);
1196 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001197 col++, altcol++;
1198 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001200 altcol = (altcol/tok->alttabsize + 1)
1201 * tok->alttabsize;
1202 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001203 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001204 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 else
1206 break;
1207 }
1208 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001209 if (c == '#' || c == '\n') {
1210 /* Lines with only whitespace and/or comments
1211 shouldn't affect the indentation and are
1212 not passed to the parser as NEWLINE tokens,
1213 except *totally* empty lines in interactive
1214 mode, which signal the end of a command group. */
1215 if (col == 0 && c == '\n' && tok->prompt != NULL)
1216 blankline = 0; /* Let it through */
1217 else
1218 blankline = 1; /* Ignore completely */
1219 /* We can't jump back right here since we still
1220 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001222 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001223 if (col == tok->indstack[tok->indent]) {
1224 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001225 if (altcol != tok->altindstack[tok->indent]) {
1226 if (indenterror(tok))
1227 return ERRORTOKEN;
1228 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001230 else if (col > tok->indstack[tok->indent]) {
1231 /* Indent -- always one */
1232 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001233 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001234 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001235 return ERRORTOKEN;
1236 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001237 if (altcol <= tok->altindstack[tok->indent]) {
1238 if (indenterror(tok))
1239 return ERRORTOKEN;
1240 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001241 tok->pendin++;
1242 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001243 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001245 else /* col < tok->indstack[tok->indent] */ {
1246 /* Dedent -- any number, must be consistent */
1247 while (tok->indent > 0 &&
1248 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001249 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001250 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001251 }
1252 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001253 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001254 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001255 return ERRORTOKEN;
1256 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001257 if (altcol != tok->altindstack[tok->indent]) {
1258 if (indenterror(tok))
1259 return ERRORTOKEN;
1260 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 }
1262 }
1263 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001264
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001265 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001266
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 /* Return pending indents/dedents */
1268 if (tok->pendin != 0) {
1269 if (tok->pendin < 0) {
1270 tok->pendin++;
1271 return DEDENT;
1272 }
1273 else {
1274 tok->pendin--;
1275 return INDENT;
1276 }
1277 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001278
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001280 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 /* Skip spaces */
1282 do {
1283 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001284 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001285
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001287 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001288
Guido van Rossumab5ca152000-03-31 00:52:27 +00001289 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001291 static char *tabforms[] = {
1292 "tab-width:", /* Emacs */
1293 ":tabstop=", /* vim, full form */
1294 ":ts=", /* vim, abbreviated form */
1295 "set tabsize=", /* will vi never die? */
1296 /* more templates can be added here to support other editors */
1297 };
1298 char cbuf[80];
1299 char *tp, **cp;
1300 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001302 *tp++ = c = tok_nextc(tok);
1303 } while (c != EOF && c != '\n' &&
Neal Norwitz71e05f12006-06-12 02:07:57 +00001304 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
Guido van Rossumab5ca152000-03-31 00:52:27 +00001305 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001306 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001307 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1308 cp++) {
1309 if ((tp = strstr(cbuf, *cp))) {
1310 int newsize = atoi(tp + strlen(*cp));
1311
1312 if (newsize >= 1 && newsize <= 40) {
1313 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001314 if (Py_VerboseFlag)
1315 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001316 "Tab size set to %d\n",
1317 newsize);
1318 }
1319 }
1320 }
1321 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001323 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001324
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001325 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001326 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001328 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001329
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001330 /* Identifier (most frequent token!) */
1331 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001332 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001333 switch (c) {
Christian Heimes288e89a2008-01-18 18:24:07 +00001334 case 'b':
1335 case 'B':
1336 c = tok_nextc(tok);
1337 if (c == 'r' || c == 'R')
1338 c = tok_nextc(tok);
1339 if (c == '"' || c == '\'')
1340 goto letter_quote;
1341 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001342 case 'r':
1343 case 'R':
1344 c = tok_nextc(tok);
1345 if (c == '"' || c == '\'')
1346 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001347 break;
1348 case 'u':
1349 case 'U':
1350 c = tok_nextc(tok);
1351 if (c == 'r' || c == 'R')
1352 c = tok_nextc(tok);
1353 if (c == '"' || c == '\'')
1354 goto letter_quote;
1355 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001356 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001357 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001359 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001361 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 *p_end = tok->cur;
1363 return NAME;
1364 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001365
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 /* Newline */
1367 if (c == '\n') {
1368 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001369 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001370 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001371 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001373 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 return NEWLINE;
1375 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001376
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001377 /* Period or number starting with period? */
1378 if (c == '.') {
1379 c = tok_nextc(tok);
1380 if (isdigit(c)) {
1381 goto fraction;
1382 }
1383 else {
1384 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001386 *p_end = tok->cur;
1387 return DOT;
1388 }
1389 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001390
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 /* Number */
1392 if (isdigit(c)) {
1393 if (c == '0') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001394 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 c = tok_nextc(tok);
1396 if (c == '.')
1397 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001398#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001399 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001400 goto imaginary;
1401#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 if (c == 'x' || c == 'X') {
Georg Brandl14404b62008-01-19 19:27:05 +00001403
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 /* Hex */
Georg Brandl14404b62008-01-19 19:27:05 +00001405 c = tok_nextc(tok);
1406 if (!isxdigit(c)) {
1407 tok->done = E_TOKEN;
1408 tok_backup(tok, c);
1409 return ERRORTOKEN;
1410 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 do {
1412 c = tok_nextc(tok);
1413 } while (isxdigit(c));
1414 }
Eric Smith9ff19b52008-03-17 17:32:20 +00001415 else if (c == 'o' || c == 'O') {
1416 /* Octal */
1417 c = tok_nextc(tok);
Amaury Forgeot d'Arc52167212008-04-24 18:07:05 +00001418 if (c < '0' || c >= '8') {
Eric Smith9ff19b52008-03-17 17:32:20 +00001419 tok->done = E_TOKEN;
1420 tok_backup(tok, c);
1421 return ERRORTOKEN;
1422 }
1423 do {
1424 c = tok_nextc(tok);
1425 } while ('0' <= c && c < '8');
1426 }
1427 else if (c == 'b' || c == 'B') {
1428 /* Binary */
1429 c = tok_nextc(tok);
1430 if (c != '0' && c != '1') {
1431 tok->done = E_TOKEN;
1432 tok_backup(tok, c);
1433 return ERRORTOKEN;
1434 }
1435 do {
1436 c = tok_nextc(tok);
1437 } while (c == '0' || c == '1');
1438 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001439 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001440 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 /* Octal; c is first char of it */
1442 /* There's no 'isoctdigit' macro, sigh */
1443 while ('0' <= c && c < '8') {
1444 c = tok_nextc(tok);
1445 }
Tim Petersd507dab2001-08-30 20:51:59 +00001446 if (isdigit(c)) {
1447 found_decimal = 1;
1448 do {
1449 c = tok_nextc(tok);
1450 } while (isdigit(c));
1451 }
1452 if (c == '.')
1453 goto fraction;
1454 else if (c == 'e' || c == 'E')
1455 goto exponent;
1456#ifndef WITHOUT_COMPLEX
1457 else if (c == 'j' || c == 'J')
1458 goto imaginary;
1459#endif
1460 else if (found_decimal) {
1461 tok->done = E_TOKEN;
1462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1464 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001465 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001466 if (c == 'l' || c == 'L')
1467 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001468 }
1469 else {
1470 /* Decimal */
1471 do {
1472 c = tok_nextc(tok);
1473 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001474 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001475 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001476 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001477 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001478 if (c == '.') {
1479 fraction:
1480 /* Fraction */
1481 do {
1482 c = tok_nextc(tok);
1483 } while (isdigit(c));
1484 }
1485 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001486 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001487 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001488 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001489 if (c == '+' || c == '-')
1490 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001491 if (!isdigit(c)) {
1492 tok->done = E_TOKEN;
1493 tok_backup(tok, c);
1494 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001495 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001496 do {
1497 c = tok_nextc(tok);
1498 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001499 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001500#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001501 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001502 /* Imaginary part */
1503 imaginary:
1504 c = tok_nextc(tok);
1505#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001506 }
1507 }
1508 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001509 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001510 *p_end = tok->cur;
1511 return NUMBER;
1512 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001513
1514 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001515 /* String */
1516 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001517 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001518 int quote = c;
1519 int triple = 0;
1520 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 for (;;) {
1522 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001523 if (c == '\n') {
1524 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001525 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001526 tok_backup(tok, c);
1527 return ERRORTOKEN;
1528 }
1529 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001530 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001531 }
1532 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001533 if (triple)
1534 tok->done = E_EOFS;
1535 else
1536 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001537 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001538 return ERRORTOKEN;
1539 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001540 else if (c == quote) {
1541 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001542 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001543 c = tok_nextc(tok);
1544 if (c == quote) {
1545 triple = 1;
1546 tripcount = 0;
1547 continue;
1548 }
1549 tok_backup(tok, c);
1550 }
1551 if (!triple || tripcount == 3)
1552 break;
1553 }
1554 else if (c == '\\') {
1555 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001556 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001557 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001558 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001559 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001560 return ERRORTOKEN;
1561 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001562 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001563 else
1564 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001566 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001567 *p_end = tok->cur;
1568 return STRING;
1569 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001570
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001571 /* Line continuation */
1572 if (c == '\\') {
1573 c = tok_nextc(tok);
1574 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001575 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001576 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001577 return ERRORTOKEN;
1578 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001579 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001580 goto again; /* Read next line */
1581 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001582
Guido van Rossumfbab9051991-10-20 20:25:03 +00001583 /* Check for two-character token */
1584 {
1585 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001586 int token = PyToken_TwoChars(c, c2);
Christian Heimes02c9ab52007-11-23 12:12:02 +00001587#ifndef PGEN
Amaury Forgeot d'Arc6dae85f2007-11-24 13:20:22 +00001588 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
Christian Heimes02c9ab52007-11-23 12:12:02 +00001589 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
Georg Brandld5b635f2008-03-25 08:29:14 +00001590 "<> not supported in 3.x; use !=",
Christian Heimes02c9ab52007-11-23 12:12:02 +00001591 tok->filename, tok->lineno,
1592 NULL, NULL)) {
1593 return ERRORTOKEN;
1594 }
1595 }
1596#endif
Guido van Rossumfbab9051991-10-20 20:25:03 +00001597 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001598 int c3 = tok_nextc(tok);
1599 int token3 = PyToken_ThreeChars(c, c2, c3);
1600 if (token3 != OP) {
1601 token = token3;
1602 } else {
1603 tok_backup(tok, c3);
1604 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001605 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001606 *p_end = tok->cur;
1607 return token;
1608 }
1609 tok_backup(tok, c2);
1610 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001611
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001612 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001613 switch (c) {
1614 case '(':
1615 case '[':
1616 case '{':
1617 tok->level++;
1618 break;
1619 case ')':
1620 case ']':
1621 case '}':
1622 tok->level--;
1623 break;
1624 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001625
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001626 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001627 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001628 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001629 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001630}
1631
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001632int
1633PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1634{
1635 int result = tok_get(tok, p_start, p_end);
1636 if (tok->decoding_erred) {
1637 result = ERRORTOKEN;
1638 tok->done = E_DECODE;
1639 }
1640 return result;
1641}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001642
Martin v. Löwisa5136192007-09-04 14:19:28 +00001643/* This function is only called from parsetok. However, it cannot live
1644 there, as it must be empty for PGEN, and we can check for PGEN only
1645 in this file. */
1646
Christian Heimes082c9b02008-01-23 14:20:50 +00001647#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant compiled when PGEN is defined or Py_USING_UNICODE is not: there
   is nothing to restore, so the arguments are ignored, *offset is left
   untouched and NULL is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1653#else
Georg Brandl76b30d12008-01-07 18:41:34 +00001654#ifdef Py_USING_UNICODE
Martin v. Löwisa5136192007-09-04 14:19:28 +00001655static PyObject *
1656dec_utf8(const char *enc, const char *text, size_t len) {
1657 PyObject *ret = NULL;
1658 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1659 if (unicode_text) {
1660 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1661 Py_DECREF(unicode_text);
1662 }
1663 if (!ret) {
Guido van Rossum9fc1b962007-10-15 15:54:11 +00001664 PyErr_Clear();
Martin v. Löwisa5136192007-09-04 14:19:28 +00001665 }
1666 return ret;
1667}
Martin v. Löwisa5136192007-09-04 14:19:28 +00001668char *
1669PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1670{
1671 char *text = NULL;
1672 if (tok->encoding) {
1673 /* convert source to original encondig */
1674 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1675 if (lineobj != NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001676 int linelen = PyString_Size(lineobj);
1677 const char *line = PyString_AsString(lineobj);
Martin v. Löwisa5136192007-09-04 14:19:28 +00001678 text = PyObject_MALLOC(linelen + 1);
1679 if (text != NULL && line != NULL) {
1680 if (linelen)
1681 strncpy(text, line, linelen);
1682 text[linelen] = '\0';
1683 }
1684 Py_DECREF(lineobj);
1685
1686 /* adjust error offset */
1687 if (*offset > 1) {
1688 PyObject *offsetobj = dec_utf8(tok->encoding,
1689 tok->buf, *offset-1);
1690 if (offsetobj) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001691 *offset = PyString_Size(offsetobj) + 1;
Martin v. Löwisa5136192007-09-04 14:19:28 +00001692 Py_DECREF(offsetobj);
1693 }
1694 }
1695
1696 }
1697 }
1698 return text;
1699
1700}
Georg Brandl76b30d12008-01-07 18:41:34 +00001701#endif /* defined(Py_USING_UNICODE) */
Martin v. Löwisa5136192007-09-04 14:19:28 +00001702#endif
1703
Martin v. Löwisa5136192007-09-04 14:19:28 +00001704
Guido van Rossum408027e1996-12-30 16:17:54 +00001705#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001706
1707void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001708tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001709{
Guido van Rossum86bea461997-04-29 21:03:06 +00001710 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001711 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1712 printf("(%.*s)", (int)(end - start), start);
1713}
1714
1715#endif